From 2b2f90a8c902f0f8efbe0bcf08e644793cd4dee4 Mon Sep 17 00:00:00 2001 From: ZiniuYu Date: Tue, 27 Sep 2022 15:42:39 +0800 Subject: [PATCH 01/34] docs: clip benchmark on zeroshot classification and retrieval tasks --- docs/user-guides/benchmark.md | 102 ++++++++++++++++++++++++++++++++++ 1 file changed, 102 insertions(+) create mode 100644 docs/user-guides/benchmark.md diff --git a/docs/user-guides/benchmark.md b/docs/user-guides/benchmark.md new file mode 100644 index 000000000..40cfcd140 --- /dev/null +++ b/docs/user-guides/benchmark.md @@ -0,0 +1,102 @@ +# CLIP Benchmark + +## Basic statistics + +We include the disk usage (in delta) and the peak RAM and VRAM usage (in delta) when running on a single Nvidia TITAN RTX GPU (24GB VRAM) for a series of text and image encoding tasks with `batch_size=8` using PyTorch runtime. + +| Model | Disk Usage (MB) | Peak RAM Usage (GB) | Peak VRAM Usage (GB) | +|---------------------------------------|-----------------|---------------------|----------------------| +| RN50::openai | 244 | 2.99 | 1.36 | +| RN50::yfcc15m | 389 | 2.86 | 1.36 | +| RN50::cc12m | 389 | 2.84 | 1.36 | +| RN101::openai | 278 | 3.05 | 1.40 | +| RN101::yfcc15m | 457 | 2.88 | 1.40 | +| RN50x4::openai | 402 | 3.23 | 1.63 | +| RN50x16::openai | 631 | 3.63 | 2.02 | +| RN50x64::openai | 1291 | 4.08 | 2.98 | +| ViT-B-32::openai | 338 | 3.20 | 1.40 | +| ViT-B-32::laion400m_e31 | 577 | 2.93 | 1.40 | +| ViT-B-32::laion400m_e32 | 577 | 2.94 | 1.40 | +| ViT-B-32::laion2b_e16 | 577 | 2.93 | 1.40 | +| ViT-B-32::laion2B-s34B-b79K | 577 | 2.94 | 1.40 | +| ViT-B-16::openai | 335 | 3.20 | 1.44 | +| ViT-B-16::laion400m_e31 | 571 | 2.93 | 1.44 | +| ViT-B-16::laion400m_e32 | 571 | 2.94 | 1.44 | +| ViT-B-16-plus-240::laion400m_e31 | 795 | 3.03 | 1.59 | +| ViT-B-16-plus-240::laion400m_e32 | 795 | 3.03 | 1.59 | +| ViT-L-14::openai | 890 | 3.66 | 2.04 | +| ViT-L-14::laion400m_e31 | 1631 | 3.43 | 2.03 | +| ViT-L-14::laion400m_e32 | 1631 | 3.42 | 2.03 | +| 
ViT-L-14::laion2B-s32B-b82K | 1631 | 3.43 | 2.03 | +| ViT-L-14-336::openai | 891 | 3.74 | 2.23 | +| ViT-H-14::laion2B-s32B-b79K | 3762 | 4.45 | 3.26 | +| ViT-g-14::laion2B-s12B-b42K | 5214 | 5.16 | 4.00 | +| M-CLIP/LABSE-Vit-L-14 | 3609 | 4.30 | 4.70 | +| M-CLIP/XLM-Roberta-Large-Vit-B-32 | 4284 | 5.37 | 1.68 | +| M-CLIP/XLM-Roberta-Large-Vit-B-16Plus | 4293 | 4.30 | 4.13 | +| M-CLIP/XLM-Roberta-Large-Vit-L-14 | 4293 | 4.30 | 4.97 | + + +````{dropdown} Zero-shot retrieval: MS COCO Captions + +| model_fullname | image_retrieval_recall@5 | text_retrieval_recall@5 | +|----------------------------------|--------------------------|-------------------------| +| RN50::openai | 0.5291883349 | 0.7282000184 | +| RN50::yfcc15m | 0.3610555828 | 0.5338000059 | +| RN50::cc12m | 0.4464214444 | 0.6065999866 | +| RN101::openai | 0.5550180078 | 0.7447999716 | +| RN101::yfcc15m | 0.3760095835 | 0.5490000248 | +| RN50x4::openai | 0.5814074278 | 0.7670000196 | +| RN50x16::openai | 0.6001599431 | 0.7868000269 | +| RN50x64::openai | 0.5992003083 | 0.8033999801 | +| ViT-B-32::openai | 0.5596161485 | 0.7491999865 | +| ViT-B-32::laion400m_e31 | 0.600039959 | 0.7630000114 | +| ViT-B-32::laion400m_e32 | 0.6000000238 | 0.7645999789 | +| ViT-B-32::laion2b_e16 | 0.6468212605 | 0.7950000167 | +| ViT-B-32::laion2b_s34b_b79k | 0.6540184021 | 0.7983999848 | +| ViT-B-16::openai | 0.5842063427 | 0.7671999931 | +| ViT-B-16::laion400m_e31 | 0.6368252635 | 0.7961999774 | +| ViT-B-16::laion400m_e32 | 0.6363854408 | 0.7964000106 | +| ViT-B-16-plus-240::laion400m_e31 | 0.6604158282 | 0.8090000153 | +| ViT-B-16-plus-240::laion400m_e32 | 0.6618952155 | 0.8108000159 | +| ViT-L-14::openai | 0.610355854 | 0.793200016 | +| ViT-L-14::laion400m_e31 | 0.679688096 | 0.82099998 | +| ViT-L-14::laion400m_e32 | 0.6801279783 | 0.8212000132 | +| ViT-L-14::laion2b_s32b_b82k | 0.7109556198 | 0.8399999738 | +| ViT-L-14-336::openai | 0.6162734628 | 0.8123999834 | +| ViT-H-14::laion2b_s32b_b79k | 0.7339064479 | 0.8605999947 | 
+| ViT-g-14::laion2b_s12b_b42k | 0.7235905528 | 0.853399992 | + +```` + +````{dropdown} Zero-shot classification: ImageNetV2 + +| model_fullname | acc1 | acc5 | mean_per_class_recall | +|----------------------------------|--------|--------|-----------------------| +| RN50::openai | 0.5287 | 0.8148 | 0.5291 | +| RN50::yfcc15m | 0.2139 | 0.4253 | 0.2145 | +| RN50::cc12m | 0.2238 | 0.4563 | 0.2244 | +| RN101::openai | 0.5608 | 0.8314 | 0.5617 | +| RN101::yfcc15m | 0.2212 | 0.4397 | 0.2216 | +| RN50x4::openai | 0.5944 | 0.8584 | 0.5946 | +| RN50x16::openai | 0.6427 | 0.8837 | 0.643 | +| RN50x64::openai | 0.6703 | 0.907 | 0.6702 | +| ViT-B-32::openai | 0.5594 | 0.8339 | 0.5595 | +| ViT-B-32::laion400m_e31 | 0.5226 | 0.794 | 0.5233 | +| ViT-B-32::laion400m_e32 | 0.5232 | 0.7947 | 0.5235 | +| ViT-B-32::laion2b_e16 | 0.5729 | 0.8391 | 0.5737 | +| ViT-B-32::laion2b_s34b_b79k | 0.5814 | 0.8392 | 0.5808 | +| ViT-B-16::openai | 0.6186 | 0.8735 | 0.6189 | +| ViT-B-16::laion400m_e31 | 0.5942 | 0.8527 | 0.5941 | +| ViT-B-16::laion400m_e32 | 0.5965 | 0.8542 | 0.5963 | +| ViT-B-16-plus-240::laion400m_e31 | 0.6139 | 0.8631 | 0.6146 | +| ViT-B-16-plus-240::laion400m_e32 | 0.6147 | 0.8646 | 0.614 | +| ViT-L-14::openai | 0.6983 | 0.9092 | 0.6986 | +| ViT-L-14::laion400m_e31 | 0.6543 | 0.886 | 0.6547 | +| ViT-L-14::laion400m_e32 | 0.6539 | 0.8857 | 0.6543 | +| ViT-L-14::laion2b_s32b_b82k | 0.6774 | 0.9024 | 0.6783 | +| ViT-L-14-336::openai | 0.7094 | 0.9164 | 0.7094 | +| ViT-H-14::laion2b_s32b_b79k | 0.7087 | 0.9166 | 0.7091 | +| ViT-g-14::laion2b_s12b_b42k | 0.6956 | 0.9086 | 0.6962 | + +```` \ No newline at end of file From 54421cc77726accc5fcf020f8e1f4653e5caad71 Mon Sep 17 00:00:00 2001 From: ZiniuYu Date: Tue, 27 Sep 2022 16:20:52 +0800 Subject: [PATCH 02/34] docs: add label --- docs/index.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/index.md b/docs/index.md index cf6babb65..f243fc404 100644 --- a/docs/index.md +++ b/docs/index.md @@ -178,6 +178,7 @@ It means the 
client and the server are now connected. Well done! user-guides/client user-guides/server user-guides/retriever +user-guides/benchmark user-guides/faq ``` From 8d445eb964b0bacaa5f1fff1000e557a7181342d Mon Sep 17 00:00:00 2001 From: ZiniuYu Date: Fri, 30 Sep 2022 00:23:24 +0800 Subject: [PATCH 03/34] docs: introduction --- docs/user-guides/benchmark.md | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/docs/user-guides/benchmark.md b/docs/user-guides/benchmark.md index 40cfcd140..98c900f59 100644 --- a/docs/user-guides/benchmark.md +++ b/docs/user-guides/benchmark.md @@ -1,14 +1,17 @@ # CLIP Benchmark +In order to evaluate the performance of different CLIP models, we conducted a benchmark on a series of tasks using different datasets. You can find the benchmark results in the following tables. The best results are highlighted in bold. They can be used as a guide to choose the best model for your application. + + ## Basic statistics We include the disk usage (in delta) and the peak RAM and VRAM usage (in delta) when running on a single Nvidia TITAN RTX GPU (24GB VRAM) for a series of text and image encoding tasks with `batch_size=8` using PyTorch runtime. 
| Model | Disk Usage (MB) | Peak RAM Usage (GB) | Peak VRAM Usage (GB) | |---------------------------------------|-----------------|---------------------|----------------------| -| RN50::openai | 244 | 2.99 | 1.36 | -| RN50::yfcc15m | 389 | 2.86 | 1.36 | -| RN50::cc12m | 389 | 2.84 | 1.36 | +| RN50::openai | **244** | 2.99 | **1.36** | +| RN50::yfcc15m | 389 | 2.86 | **1.36** | +| RN50::cc12m | 389 | **2.84** | **1.36** | | RN101::openai | 278 | 3.05 | 1.40 | | RN101::yfcc15m | 457 | 2.88 | 1.40 | | RN50x4::openai | 402 | 3.23 | 1.63 | From 01a2be4d6603e1704a1a66f5ae7f914d611721e6 Mon Sep 17 00:00:00 2001 From: ZiniuYu Date: Fri, 30 Sep 2022 00:24:39 +0800 Subject: [PATCH 04/34] docs: open clip naming convention --- docs/user-guides/benchmark.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/user-guides/benchmark.md b/docs/user-guides/benchmark.md index 98c900f59..8abdd7dae 100644 --- a/docs/user-guides/benchmark.md +++ b/docs/user-guides/benchmark.md @@ -21,7 +21,7 @@ We include the disk usage (in delta) and the peak RAM and VRAM usage (in delta) | ViT-B-32::laion400m_e31 | 577 | 2.93 | 1.40 | | ViT-B-32::laion400m_e32 | 577 | 2.94 | 1.40 | | ViT-B-32::laion2b_e16 | 577 | 2.93 | 1.40 | -| ViT-B-32::laion2B-s34B-b79K | 577 | 2.94 | 1.40 | +| ViT-B-32::laion2b-s34B-b79k | 577 | 2.94 | 1.40 | | ViT-B-16::openai | 335 | 3.20 | 1.44 | | ViT-B-16::laion400m_e31 | 571 | 2.93 | 1.44 | | ViT-B-16::laion400m_e32 | 571 | 2.94 | 1.44 | @@ -30,10 +30,10 @@ We include the disk usage (in delta) and the peak RAM and VRAM usage (in delta) | ViT-L-14::openai | 890 | 3.66 | 2.04 | | ViT-L-14::laion400m_e31 | 1631 | 3.43 | 2.03 | | ViT-L-14::laion400m_e32 | 1631 | 3.42 | 2.03 | -| ViT-L-14::laion2B-s32B-b82K | 1631 | 3.43 | 2.03 | +| ViT-L-14::laion2b-s32b-b82k | 1631 | 3.43 | 2.03 | | ViT-L-14-336::openai | 891 | 3.74 | 2.23 | -| ViT-H-14::laion2B-s32B-b79K | 3762 | 4.45 | 3.26 | -| ViT-g-14::laion2B-s12B-b42K | 5214 | 5.16 | 4.00 | +| 
ViT-H-14::laion2b-s32B-b79k | 3762 | 4.45 | 3.26 | +| ViT-g-14::laion2b-s12B-b42k | 5214 | 5.16 | 4.00 | | M-CLIP/LABSE-Vit-L-14 | 3609 | 4.30 | 4.70 | | M-CLIP/XLM-Roberta-Large-Vit-B-32 | 4284 | 5.37 | 1.68 | | M-CLIP/XLM-Roberta-Large-Vit-B-16Plus | 4293 | 4.30 | 4.13 | From 844cba17133f56292cf0381e0bc25a175a6a1355 Mon Sep 17 00:00:00 2001 From: ZiniuYu Date: Fri, 30 Sep 2022 00:52:05 +0800 Subject: [PATCH 05/34] fix: typo --- docs/user-guides/benchmark.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/user-guides/benchmark.md b/docs/user-guides/benchmark.md index 8abdd7dae..5a917a51a 100644 --- a/docs/user-guides/benchmark.md +++ b/docs/user-guides/benchmark.md @@ -21,7 +21,7 @@ We include the disk usage (in delta) and the peak RAM and VRAM usage (in delta) | ViT-B-32::laion400m_e31 | 577 | 2.93 | 1.40 | | ViT-B-32::laion400m_e32 | 577 | 2.94 | 1.40 | | ViT-B-32::laion2b_e16 | 577 | 2.93 | 1.40 | -| ViT-B-32::laion2b-s34B-b79k | 577 | 2.94 | 1.40 | +| ViT-B-32::laion2b-s34b-b79k | 577 | 2.94 | 1.40 | | ViT-B-16::openai | 335 | 3.20 | 1.44 | | ViT-B-16::laion400m_e31 | 571 | 2.93 | 1.44 | | ViT-B-16::laion400m_e32 | 571 | 2.94 | 1.44 | @@ -32,8 +32,8 @@ We include the disk usage (in delta) and the peak RAM and VRAM usage (in delta) | ViT-L-14::laion400m_e32 | 1631 | 3.42 | 2.03 | | ViT-L-14::laion2b-s32b-b82k | 1631 | 3.43 | 2.03 | | ViT-L-14-336::openai | 891 | 3.74 | 2.23 | -| ViT-H-14::laion2b-s32B-b79k | 3762 | 4.45 | 3.26 | -| ViT-g-14::laion2b-s12B-b42k | 5214 | 5.16 | 4.00 | +| ViT-H-14::laion2b-s32b-b79k | 3762 | 4.45 | 3.26 | +| ViT-g-14::laion2b-s12b-b42k | 5214 | 5.16 | 4.00 | | M-CLIP/LABSE-Vit-L-14 | 3609 | 4.30 | 4.70 | | M-CLIP/XLM-Roberta-Large-Vit-B-32 | 4284 | 5.37 | 1.68 | | M-CLIP/XLM-Roberta-Large-Vit-B-16Plus | 4293 | 4.30 | 4.13 | From 1e87dec751bae7a231b3dd153e9d6a629cfb6d53 Mon Sep 17 00:00:00 2001 From: ZiniuYu Date: Fri, 30 Sep 2022 01:32:47 +0800 Subject: [PATCH 06/34] docs: retrieval table --- 
docs/user-guides/benchmark.md | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/docs/user-guides/benchmark.md b/docs/user-guides/benchmark.md index 5a917a51a..2c65d5050 100644 --- a/docs/user-guides/benchmark.md +++ b/docs/user-guides/benchmark.md @@ -40,6 +40,40 @@ We include the disk usage (in delta) and the peak RAM and VRAM usage (in delta) | M-CLIP/XLM-Roberta-Large-Vit-L-14 | 4293 | 4.30 | 4.97 | +## Zero-shot retrieval + +| Model | COCO Caption | | Flickr 8k | | Flickr 30k | | +|----------------------------------|--------------|-------|-----------|-------|------------|-------| +| | Image | Text | Image | Text | Image | Text | +| RN101::openai | 0.555 | 0.745 | 0.523 | 0.694 | 0.415 | 0.629 | +| RN101::yfcc15m | 0.376 | 0.549 | 0.251 | 0.417 | 0.156 | 0.296 | +| RN50::cc12m | 0.446 | 0.607 | 0.302 | 0.435 | 0.204 | 0.316 | +| RN50::openai | 0.529 | 0.728 | 0.504 | 0.690 | 0.392 | 0.621 | +| RN50::yfcc15m | 0.361 | 0.534 | 0.238 | 0.394 | 0.146 | 0.278 | +| RN50x16::openai | 0.600 | 0.787 | 0.597 | 0.768 | 0.496 | 0.713 | +| RN50x4::openai | 0.581 | 0.767 | 0.558 | 0.729 | 0.451 | 0.671 | +| RN50x64::openai | 0.599 | 0.803 | 0.629 | 0.790 | 0.534 | 0.756 | +| ViT-B-16::laion400m_e31 | 0.637 | 0.796 | 0.620 | 0.765 | 0.506 | 0.697 | +| ViT-B-16::laion400m_e32 | 0.636 | 0.796 | 0.620 | 0.767 | 0.508 | 0.697 | +| ViT-B-16::openai | 0.584 | 0.767 | 0.564 | 0.727 | 0.452 | 0.671 | +| ViT-B-16-plus-240::laion400m_e31 | 0.660 | 0.809 | 0.642 | 0.788 | 0.533 | 0.725 | +| ViT-B-16-plus-240::laion400m_e32 | 0.662 | 0.811 | 0.644 | 0.791 | 0.535 | 0.727 | +| ViT-B-32::laion2b_e16 | 0.647 | 0.795 | 0.622 | 0.760 | 0.507 | 0.687 | +| ViT-B-32::laion2b_s34b_b79k | 0.654 | 0.798 | 0.629 | 0.778 | 0.513 | 0.694 | +| ViT-B-32::laion400m_e31 | 0.600 | 0.763 | 0.562 | 0.736 | 0.438 | 0.633 | +| ViT-B-32::laion400m_e32 | 0.600 | 0.765 | 0.562 | 0.736 | 0.437 | 0.634 | +| ViT-B-32::openai | 0.560 | 0.749 | 0.532 | 0.699 | 0.413 | 0.629 | +| 
ViT-g-14::laion2b_s12b_b42k | 0.724 | 0.853 | 0.730 | 0.846 | 0.639 | 0.806 | +| ViT-H-14::laion2b_s32b_b79k | 0.734 | 0.861 | 0.746 | 0.856 | 0.657 | 0.823 | +| ViT-L-14::laion2b_s32b_b82k | 0.711 | 0.840 | 0.712 | 0.824 | 0.620 | 0.789 | +| ViT-L-14::laion400m_e31 | 0.680 | 0.821 | 0.675 | 0.806 | 0.570 | 0.751 | +| ViT-L-14::laion400m_e32 | 0.680 | 0.821 | 0.675 | 0.806 | 0.570 | 0.751 | +| ViT-L-14::openai | 0.610 | 0.793 | 0.599 | 0.767 | 0.494 | 0.717 | +| ViT-L-14-336::openai | 0.616 | 0.812 | 0.629 | 0.779 | 0.533 | 0.741 | + +## Zero-shot classification + + ````{dropdown} Zero-shot retrieval: MS COCO Captions | model_fullname | image_retrieval_recall@5 | text_retrieval_recall@5 | From 5b2f7824bc5120dc3f47355f65474a4dd60ecc6a Mon Sep 17 00:00:00 2001 From: ZiniuYu Date: Fri, 30 Sep 2022 16:54:34 +0800 Subject: [PATCH 07/34] docs: update classification --- docs/user-guides/benchmark.md | 61 ++++++++++++++++------------------- 1 file changed, 28 insertions(+), 33 deletions(-) diff --git a/docs/user-guides/benchmark.md b/docs/user-guides/benchmark.md index 2c65d5050..4d8524293 100644 --- a/docs/user-guides/benchmark.md +++ b/docs/user-guides/benchmark.md @@ -71,9 +71,6 @@ We include the disk usage (in delta) and the peak RAM and VRAM usage (in delta) | ViT-L-14::openai | 0.610 | 0.793 | 0.599 | 0.767 | 0.494 | 0.717 | | ViT-L-14-336::openai | 0.616 | 0.812 | 0.629 | 0.779 | 0.533 | 0.741 | -## Zero-shot classification - - ````{dropdown} Zero-shot retrieval: MS COCO Captions | model_fullname | image_retrieval_recall@5 | text_retrieval_recall@5 | @@ -106,34 +103,32 @@ We include the disk usage (in delta) and the peak RAM and VRAM usage (in delta) ```` -````{dropdown} Zero-shot classification: ImageNetV2 - -| model_fullname | acc1 | acc5 | mean_per_class_recall | -|----------------------------------|--------|--------|-----------------------| -| RN50::openai | 0.5287 | 0.8148 | 0.5291 | -| RN50::yfcc15m | 0.2139 | 0.4253 | 0.2145 | -| RN50::cc12m | 0.2238 | 0.4563 
| 0.2244 | -| RN101::openai | 0.5608 | 0.8314 | 0.5617 | -| RN101::yfcc15m | 0.2212 | 0.4397 | 0.2216 | -| RN50x4::openai | 0.5944 | 0.8584 | 0.5946 | -| RN50x16::openai | 0.6427 | 0.8837 | 0.643 | -| RN50x64::openai | 0.6703 | 0.907 | 0.6702 | -| ViT-B-32::openai | 0.5594 | 0.8339 | 0.5595 | -| ViT-B-32::laion400m_e31 | 0.5226 | 0.794 | 0.5233 | -| ViT-B-32::laion400m_e32 | 0.5232 | 0.7947 | 0.5235 | -| ViT-B-32::laion2b_e16 | 0.5729 | 0.8391 | 0.5737 | -| ViT-B-32::laion2b_s34b_b79k | 0.5814 | 0.8392 | 0.5808 | -| ViT-B-16::openai | 0.6186 | 0.8735 | 0.6189 | -| ViT-B-16::laion400m_e31 | 0.5942 | 0.8527 | 0.5941 | -| ViT-B-16::laion400m_e32 | 0.5965 | 0.8542 | 0.5963 | -| ViT-B-16-plus-240::laion400m_e31 | 0.6139 | 0.8631 | 0.6146 | -| ViT-B-16-plus-240::laion400m_e32 | 0.6147 | 0.8646 | 0.614 | -| ViT-L-14::openai | 0.6983 | 0.9092 | 0.6986 | -| ViT-L-14::laion400m_e31 | 0.6543 | 0.886 | 0.6547 | -| ViT-L-14::laion400m_e32 | 0.6539 | 0.8857 | 0.6543 | -| ViT-L-14::laion2b_s32b_b82k | 0.6774 | 0.9024 | 0.6783 | -| ViT-L-14-336::openai | 0.7094 | 0.9164 | 0.7094 | -| ViT-H-14::laion2b_s32b_b79k | 0.7087 | 0.9166 | 0.7091 | -| ViT-g-14::laion2b_s12b_b42k | 0.6956 | 0.9086 | 0.6962 | +## Zero-shot classification -```` \ No newline at end of file +| model_fullname | imagenetv2 | voc2007 | vtab/caltech101 | vtab/cifar10 | vtab/cifar100 | vtab/dtd | vtab/flowers | vtab/pets | vtab/svhn | vtab/eurosat | vtab/resisc45 | vtab/pcam | vtab/diabetic_retinopathy | vtab/clevr_count_all | vtab/clevr_closest_object_distance | vtab/dsprites_label_x_position | vtab/dsprites_label_orientation | vtab/smallnorb_label_azimuth | vtab/smallnorb_label_elevation | vtab/dmlab | vtab/kitti_closest_vehicle_distance | 
+|---------------------------------|------------|---------|-----------------|--------------|---------------|----------|--------------|-----------|-----------|--------------|---------------|-----------|---------------------------|----------------------|------------------------------------|--------------------------------|---------------------------------|------------------------------|--------------------------------|------------|-------------------------------------| +| RN101 openai | 0.561 | 0.651 | 0.780 | 0.807 | 0.476 | 0.432 | 0.652 | 0.869 | 0.226 | 0.314 | 0.547 | 0.583 | 0.280 | 0.242 | 0.130 | 0.031 | 0.021 | 0.054 | 0.111 | 0.139 | 0.263 | +| RN101 yfcc15m | 0.221 | 0.243 | 0.469 | 0.299 | 0.125 | 0.117 | 0.210 | 0.177 | 0.137 | 0.151 | 0.099 | 0.479 | 0.584 | 0.109 | 0.159 | 0.031 | 0.019 | 0.055 | 0.097 | 0.153 | 0.252 | +| RN50 cc12m | 0.224 | 0.438 | 0.582 | 0.395 | 0.178 | 0.135 | 0.095 | 0.331 | 0.102 | 0.148 | 0.117 | 0.535 | 0.293 | 0.184 | 0.222 | 0.031 | 0.025 | 0.047 | 0.096 | 0.161 | 0.155 | +| RN50 openai | 0.529 | 0.650 | 0.772 | 0.715 | 0.403 | 0.415 | 0.660 | 0.857 | 0.303 | 0.408 | 0.453 | 0.636 | 0.171 | 0.217 | 0.148 | 0.034 | 0.014 | 0.056 | 0.110 | 0.145 | 0.170 | +| RN50 yfcc15m | 0.214 | 0.215 | 0.402 | 0.291 | 0.116 | 0.122 | 0.167 | 0.174 | 0.157 | 0.172 | 0.123 | 0.533 | 0.358 | 0.151 | 0.158 | 0.032 | 0.024 | 0.053 | 0.120 | 0.160 | 0.336 | +| RN50x16 openai | 0.643 | 0.680 | 0.810 | 0.813 | 0.522 | 0.524 | 0.724 | 0.898 | 0.409 | 0.433 | 0.589 | 0.625 | 0.715 | 0.195 | 0.213 | 0.030 | 0.026 | 0.050 | 0.116 | 0.146 | 0.229 | +| RN50x4 openai | 0.594 | 0.682 | 0.781 | 0.794 | 0.451 | 0.486 | 0.698 | 0.887 | 0.367 | 0.335 | 0.532 | 0.569 | 0.318 | 0.205 | 0.082 | 0.031 | 0.026 | 0.056 | 0.108 | 0.162 | 0.233 | +| RN50x64 openai | 0.670 | 0.740 | 0.834 | 0.851 | 0.598 | 0.531 | 0.788 | 0.936 | 0.481 | 0.577 | 0.628 | 0.539 | 0.073 | 0.227 | 0.200 | 0.034 | 0.025 | 0.056 | 0.125 | 0.158 | 0.311 | +| ViT-B-16 laion400m_e31 | 0.594 | 
0.767 | 0.838 | 0.917 | 0.712 | 0.513 | 0.694 | 0.892 | 0.380 | 0.503 | 0.585 | 0.593 | 0.062 | 0.289 | 0.245 | 0.031 | 0.030 | 0.059 | 0.100 | 0.152 | 0.200 | +| ViT-B-16 laion400m_e32 | 0.597 | 0.768 | 0.837 | 0.917 | 0.712 | 0.513 | 0.692 | 0.892 | 0.385 | 0.501 | 0.585 | 0.598 | 0.077 | 0.287 | 0.245 | 0.032 | 0.029 | 0.060 | 0.099 | 0.151 | 0.183 | +| ViT-B-16 openai | 0.619 | 0.783 | 0.819 | 0.908 | 0.669 | 0.449 | 0.712 | 0.890 | 0.313 | 0.559 | 0.582 | 0.507 | 0.036 | 0.209 | 0.158 | 0.030 | 0.023 | 0.053 | 0.122 | 0.155 | 0.263 | +| ViT-B-16-plus-240 laion400m_e31 | 0.614 | 0.764 | 0.832 | 0.925 | 0.733 | 0.555 | 0.706 | 0.904 | 0.355 | 0.569 | 0.615 | 0.551 | 0.093 | 0.240 | 0.159 | 0.041 | 0.026 | 0.056 | 0.111 | 0.149 | 0.280 | +| ViT-B-16-plus-240 laion400m_e32 | 0.615 | 0.764 | 0.833 | 0.928 | 0.738 | 0.555 | 0.711 | 0.902 | 0.362 | 0.581 | 0.613 | 0.551 | 0.095 | 0.238 | 0.160 | 0.043 | 0.027 | 0.054 | 0.110 | 0.148 | 0.281 | +| ViT-B-32 laion2b_e16 | 0.573 | 0.788 | 0.831 | 0.941 | 0.754 | 0.539 | 0.691 | 0.893 | 0.388 | 0.503 | 0.619 | 0.506 | 0.195 | 0.192 | 0.167 | 0.031 | 0.024 | 0.052 | 0.110 | 0.189 | 0.176 | +| ViT-B-32 laion2b_s34b_b79k | 0.581 | 0.791 | 0.839 | 0.936 | 0.755 | 0.557 | 0.716 | 0.909 | 0.410 | 0.482 | 0.610 | 0.598 | 0.734 | 0.153 | 0.189 | 0.029 | 0.034 | 0.062 | 0.113 | 0.159 | 0.262 | +| ViT-B-32 laion400m_e31 | 0.523 | 0.731 | 0.818 | 0.883 | 0.678 | 0.521 | 0.659 | 0.856 | 0.220 | 0.470 | 0.510 | 0.549 | 0.259 | 0.155 | 0.161 | 0.033 | 0.021 | 0.053 | 0.117 | 0.173 | 0.122 | +| ViT-B-32 laion400m_e32 | 0.523 | 0.733 | 0.817 | 0.885 | 0.677 | 0.523 | 0.658 | 0.854 | 0.223 | 0.476 | 0.510 | 0.548 | 0.240 | 0.153 | 0.161 | 0.033 | 0.021 | 0.054 | 0.117 | 0.173 | 0.118 | +| ViT-B-32 openai | 0.559 | 0.764 | 0.815 | 0.898 | 0.643 | 0.443 | 0.664 | 0.873 | 0.135 | 0.504 | 0.537 | 0.623 | 0.447 | 0.232 | 0.164 | 0.037 | 0.024 | 0.061 | 0.127 | 0.193 | 0.274 | +| ViT-g-14 laion2b_s12b_b42k | 0.696 | 0.811 | 0.851 | 0.971 | 0.839 
| 0.682 | 0.776 | 0.943 | 0.603 | 0.648 | 0.718 | 0.560 | 0.580 | 0.332 | 0.175 | 0.036 | 0.031 | 0.060 | 0.115 | 0.190 | 0.138 | +| ViT-H-14 laion2b_s32b_b79k | 0.709 | 0.777 | 0.850 | 0.975 | 0.847 | 0.678 | 0.801 | 0.945 | 0.563 | 0.726 | 0.699 | 0.542 | 0.297 | 0.268 | 0.169 | 0.032 | 0.027 | 0.054 | 0.111 | 0.140 | 0.110 | +| ViT-L-14 laion2b_s32b_b82k | 0.677 | 0.805 | 0.851 | 0.966 | 0.833 | 0.629 | 0.758 | 0.932 | 0.459 | 0.646 | 0.668 | 0.563 | 0.116 | 0.312 | 0.161 | 0.032 | 0.020 | 0.056 | 0.108 | 0.224 | 0.229 | +| ViT-L-14 laion400m_e31 | 0.654 | 0.758 | 0.839 | 0.947 | 0.774 | 0.598 | 0.757 | 0.917 | 0.378 | 0.632 | 0.671 | 0.487 | 0.058 | 0.242 | 0.149 | 0.030 | 0.026 | 0.053 | 0.109 | 0.186 | 0.200 | +| ViT-L-14 laion400m_e32 | 0.654 | 0.756 | 0.839 | 0.946 | 0.774 | 0.605 | 0.756 | 0.919 | 0.380 | 0.622 | 0.675 | 0.493 | 0.061 | 0.243 | 0.149 | 0.030 | 0.026 | 0.053 | 0.110 | 0.186 | 0.203 | +| ViT-L-14 openai | 0.698 | 0.783 | 0.835 | 0.956 | 0.758 | 0.554 | 0.792 | 0.932 | 0.571 | 0.626 | 0.633 | 0.520 | 0.733 | 0.194 | 0.161 | 0.032 | 0.023 | 0.045 | 0.115 | 0.163 | 0.218 | +| ViT-L-14-336 openai | 0.709 | 0.781 | 0.837 | 0.949 | 0.744 | 0.556 | 0.783 | 0.937 | 0.560 | 0.615 | 0.638 | 0.608 | 0.733 | 0.200 | 0.158 | 0.032 | 0.024 | 0.046 | 0.113 | 0.158 | 0.262 | \ No newline at end of file From e4918ea02fe34ce2de88352c691e3adf610084dc Mon Sep 17 00:00:00 2001 From: ZiniuYu Date: Fri, 30 Sep 2022 17:08:08 +0800 Subject: [PATCH 08/34] chore: test html table --- docs/user-guides/benchmark.md | 665 ++++++++++++++++++++++++++++++++-- 1 file changed, 635 insertions(+), 30 deletions(-) diff --git a/docs/user-guides/benchmark.md b/docs/user-guides/benchmark.md index 4d8524293..eb61aa049 100644 --- a/docs/user-guides/benchmark.md +++ b/docs/user-guides/benchmark.md @@ -71,37 +71,642 @@ We include the disk usage (in delta) and the peak RAM and VRAM usage (in delta) | ViT-L-14::openai | 0.610 | 0.793 | 0.599 | 0.767 | 0.494 | 0.717 | | 
ViT-L-14-336::openai | 0.616 | 0.812 | 0.629 | 0.779 | 0.533 | 0.741 | -````{dropdown} Zero-shot retrieval: MS COCO Captions -| model_fullname | image_retrieval_recall@5 | text_retrieval_recall@5 | -|----------------------------------|--------------------------|-------------------------| -| RN50::openai | 0.5291883349 | 0.7282000184 | -| RN50::yfcc15m | 0.3610555828 | 0.5338000059 | -| RN50::cc12m | 0.4464214444 | 0.6065999866 | -| RN101::openai | 0.5550180078 | 0.7447999716 | -| RN101::yfcc15m | 0.3760095835 | 0.5490000248 | -| RN50x4::openai | 0.5814074278 | 0.7670000196 | -| RN50x16::openai | 0.6001599431 | 0.7868000269 | -| RN50x64::openai | 0.5992003083 | 0.8033999801 | -| ViT-B-32::openai | 0.5596161485 | 0.7491999865 | -| ViT-B-32::laion400m_e31 | 0.600039959 | 0.7630000114 | -| ViT-B-32::laion400m_e32 | 0.6000000238 | 0.7645999789 | -| ViT-B-32::laion2b_e16 | 0.6468212605 | 0.7950000167 | -| ViT-B-32::laion2b_s34b_b79k | 0.6540184021 | 0.7983999848 | -| ViT-B-16::openai | 0.5842063427 | 0.7671999931 | -| ViT-B-16::laion400m_e31 | 0.6368252635 | 0.7961999774 | -| ViT-B-16::laion400m_e32 | 0.6363854408 | 0.7964000106 | -| ViT-B-16-plus-240::laion400m_e31 | 0.6604158282 | 0.8090000153 | -| ViT-B-16-plus-240::laion400m_e32 | 0.6618952155 | 0.8108000159 | -| ViT-L-14::openai | 0.610355854 | 0.793200016 | -| ViT-L-14::laion400m_e31 | 0.679688096 | 0.82099998 | -| ViT-L-14::laion400m_e32 | 0.6801279783 | 0.8212000132 | -| ViT-L-14::laion2b_s32b_b82k | 0.7109556198 | 0.8399999738 | -| ViT-L-14-336::openai | 0.6162734628 | 0.8123999834 | -| ViT-H-14::laion2b_s32b_b79k | 0.7339064479 | 0.8605999947 | -| ViT-g-14::laion2b_s12b_b42k | 0.7235905528 | 0.853399992 | - -```` + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
model_fullnameimagenetv2voc2007Class 1Class 2Class 3
vtab/caltech101vtab/cifar10vtab/cifar100vtab/dtdvtab/flowersvtab/petsvtab/svhnvtab/eurosatvtab/resisc45vtab/pcamvtab/diabetic_retinopathyvtab/clevr_count_allvtab/clevr_closest_object_distancevtab/dsprites_label_x_positionvtab/dsprites_label_orientationvtab/smallnorb_label_azimuthvtab/smallnorb_label_elevationvtab/dmlabvtab/kitti_closest_vehicle_distance
RN101 openai0.5610.6510.7800.8070.4760.4320.6520.8690.2260.3140.5470.5830.2800.2420.1300.0310.0210.0540.1110.1390.263
RN101 yfcc15m0.2210.2430.4690.2990.1250.1170.2100.1770.1370.1510.0990.4790.5840.1090.1590.0310.0190.0550.0970.1530.252
RN50 cc12m0.2240.4380.5820.3950.1780.1350.0950.3310.1020.1480.1170.5350.2930.1840.2220.0310.0250.0470.0960.1610.155
RN50 openai0.5290.6500.7720.7150.4030.4150.6600.8570.3030.4080.4530.6360.1710.2170.1480.0340.0140.0560.1100.1450.170
RN50 yfcc15m0.2140.2150.4020.2910.1160.1220.1670.1740.1570.1720.1230.5330.3580.1510.1580.0320.0240.0530.1200.1600.336
RN50x16 openai0.6430.6800.8100.8130.5220.5240.7240.8980.4090.4330.5890.6250.7150.1950.2130.0300.0260.0500.1160.1460.229
RN50x4 openai0.5940.6820.7810.7940.4510.4860.6980.8870.3670.3350.5320.5690.3180.2050.0820.0310.0260.0560.1080.1620.233
RN50x64 openai0.6700.7400.8340.8510.5980.5310.7880.9360.4810.5770.6280.5390.0730.2270.2000.0340.0250.0560.1250.1580.311
ViT-B-16 laion400m_e310.5940.7670.8380.9170.7120.5130.6940.8920.3800.5030.5850.5930.0620.2890.2450.0310.0300.0590.1000.1520.200
ViT-B-16 laion400m_e320.5970.7680.8370.9170.7120.5130.6920.8920.3850.5010.5850.5980.0770.2870.2450.0320.0290.0600.0990.1510.183
ViT-B-16 openai0.6190.7830.8190.9080.6690.4490.7120.8900.3130.5590.5820.5070.0360.2090.1580.0300.0230.0530.1220.1550.263
ViT-B-16-plus-240 laion400m_e310.6140.7640.8320.9250.7330.5550.7060.9040.3550.5690.6150.5510.0930.2400.1590.0410.0260.0560.1110.1490.280
ViT-B-16-plus-240 laion400m_e320.6150.7640.8330.9280.7380.5550.7110.9020.3620.5810.6130.5510.0950.2380.1600.0430.0270.0540.1100.1480.281
ViT-B-32 laion2b_e160.5730.7880.8310.9410.7540.5390.6910.8930.3880.5030.6190.5060.1950.1920.1670.0310.0240.0520.1100.1890.176
ViT-B-32 laion2b_s34b_b79k0.5810.7910.8390.9360.7550.5570.7160.9090.4100.4820.6100.5980.7340.1530.1890.0290.0340.0620.1130.1590.262
ViT-B-32 laion400m_e310.5230.7310.8180.8830.6780.5210.6590.8560.2200.4700.5100.5490.2590.1550.1610.0330.0210.0530.1170.1730.122
ViT-B-32 laion400m_e320.5230.7330.8170.8850.6770.5230.6580.8540.2230.4760.5100.5480.2400.1530.1610.0330.0210.0540.1170.1730.118
ViT-B-32 openai0.5590.7640.8150.8980.6430.4430.6640.8730.1350.5040.5370.6230.4470.2320.1640.0370.0240.0610.1270.1930.274
ViT-g-14 laion2b_s12b_b42k0.6960.8110.8510.9710.8390.6820.7760.9430.6030.6480.7180.5600.5800.3320.1750.0360.0310.0600.1150.1900.138
ViT-H-14 laion2b_s32b_b79k0.7090.7770.8500.9750.8470.6780.8010.9450.5630.7260.6990.5420.2970.2680.1690.0320.0270.0540.1110.1400.110
ViT-L-14 laion2b_s32b_b82k0.6770.8050.8510.9660.8330.6290.7580.9320.4590.6460.6680.5630.1160.3120.1610.0320.0200.0560.1080.2240.229
ViT-L-14 laion400m_e310.6540.7580.8390.9470.7740.5980.7570.9170.3780.6320.6710.4870.0580.2420.1490.0300.0260.0530.1090.1860.200
ViT-L-14 laion400m_e320.6540.7560.8390.9460.7740.6050.7560.9190.3800.6220.6750.4930.0610.2430.1490.0300.0260.0530.1100.1860.203
ViT-L-14 openai0.6980.7830.8350.9560.7580.5540.7920.9320.5710.6260.6330.5200.7330.1940.1610.0320.0230.0450.1150.1630.218
ViT-L-14-336 openai0.7090.7810.8370.9490.7440.5560.7830.9370.5600.6150.6380.6080.7330.2000.1580.0320.0240.0460.1130.1580.262
## Zero-shot classification From 4ab55ecc7e7397abe8906b33dc537954799819e1 Mon Sep 17 00:00:00 2001 From: ZiniuYu Date: Fri, 30 Sep 2022 17:12:56 +0800 Subject: [PATCH 09/34] chore: update css --- docs/user-guides/benchmark.md | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/docs/user-guides/benchmark.md b/docs/user-guides/benchmark.md index eb61aa049..b2c0e70c9 100644 --- a/docs/user-guides/benchmark.md +++ b/docs/user-guides/benchmark.md @@ -72,6 +72,15 @@ We include the disk usage (in delta) and the peak RAM and VRAM usage (in delta) | ViT-L-14-336::openai | 0.616 | 0.812 | 0.629 | 0.779 | 0.533 | 0.741 | + From 24fff82e72b0ff4c1f5c27f3a63e4538913ffdcb Mon Sep 17 00:00:00 2001 From: ZiniuYu Date: Fri, 30 Sep 2022 18:34:35 +0800 Subject: [PATCH 10/34] chore: test rst --- docs/user-guides/benchmark.md | 645 ---------------------------------- docs/user-guides/test.rst | 17 + 2 files changed, 17 insertions(+), 645 deletions(-) create mode 100644 docs/user-guides/test.rst diff --git a/docs/user-guides/benchmark.md b/docs/user-guides/benchmark.md index b2c0e70c9..30b9fecd1 100644 --- a/docs/user-guides/benchmark.md +++ b/docs/user-guides/benchmark.md @@ -72,651 +72,6 @@ We include the disk usage (in delta) and the peak RAM and VRAM usage (in delta) | ViT-L-14-336::openai | 0.616 | 0.812 | 0.629 | 0.779 | 0.533 | 0.741 | - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
model_fullnameimagenetv2voc2007Class 1Class 2Class 3
vtab/caltech101vtab/cifar10vtab/cifar100vtab/dtdvtab/flowersvtab/petsvtab/svhnvtab/eurosatvtab/resisc45vtab/pcamvtab/diabetic_retinopathyvtab/clevr_count_allvtab/clevr_closest_object_distancevtab/dsprites_label_x_positionvtab/dsprites_label_orientationvtab/smallnorb_label_azimuthvtab/smallnorb_label_elevationvtab/dmlabvtab/kitti_closest_vehicle_distance
RN101 openai0.5610.6510.7800.8070.4760.4320.6520.8690.2260.3140.5470.5830.2800.2420.1300.0310.0210.0540.1110.1390.263
RN101 yfcc15m0.2210.2430.4690.2990.1250.1170.2100.1770.1370.1510.0990.4790.5840.1090.1590.0310.0190.0550.0970.1530.252
RN50 cc12m0.2240.4380.5820.3950.1780.1350.0950.3310.1020.1480.1170.5350.2930.1840.2220.0310.0250.0470.0960.1610.155
RN50 openai0.5290.6500.7720.7150.4030.4150.6600.8570.3030.4080.4530.6360.1710.2170.1480.0340.0140.0560.1100.1450.170
RN50 yfcc15m0.2140.2150.4020.2910.1160.1220.1670.1740.1570.1720.1230.5330.3580.1510.1580.0320.0240.0530.1200.1600.336
RN50x16 openai0.6430.6800.8100.8130.5220.5240.7240.8980.4090.4330.5890.6250.7150.1950.2130.0300.0260.0500.1160.1460.229
RN50x4 openai0.5940.6820.7810.7940.4510.4860.6980.8870.3670.3350.5320.5690.3180.2050.0820.0310.0260.0560.1080.1620.233
RN50x64 openai0.6700.7400.8340.8510.5980.5310.7880.9360.4810.5770.6280.5390.0730.2270.2000.0340.0250.0560.1250.1580.311
ViT-B-16 laion400m_e310.5940.7670.8380.9170.7120.5130.6940.8920.3800.5030.5850.5930.0620.2890.2450.0310.0300.0590.1000.1520.200
ViT-B-16 laion400m_e320.5970.7680.8370.9170.7120.5130.6920.8920.3850.5010.5850.5980.0770.2870.2450.0320.0290.0600.0990.1510.183
ViT-B-16 openai0.6190.7830.8190.9080.6690.4490.7120.8900.3130.5590.5820.5070.0360.2090.1580.0300.0230.0530.1220.1550.263
ViT-B-16-plus-240 laion400m_e310.6140.7640.8320.9250.7330.5550.7060.9040.3550.5690.6150.5510.0930.2400.1590.0410.0260.0560.1110.1490.280
ViT-B-16-plus-240 laion400m_e320.6150.7640.8330.9280.7380.5550.7110.9020.3620.5810.6130.5510.0950.2380.1600.0430.0270.0540.1100.1480.281
ViT-B-32 laion2b_e160.5730.7880.8310.9410.7540.5390.6910.8930.3880.5030.6190.5060.1950.1920.1670.0310.0240.0520.1100.1890.176
ViT-B-32 laion2b_s34b_b79k0.5810.7910.8390.9360.7550.5570.7160.9090.4100.4820.6100.5980.7340.1530.1890.0290.0340.0620.1130.1590.262
ViT-B-32 laion400m_e310.5230.7310.8180.8830.6780.5210.6590.8560.2200.4700.5100.5490.2590.1550.1610.0330.0210.0530.1170.1730.122
ViT-B-32 laion400m_e320.5230.7330.8170.8850.6770.5230.6580.8540.2230.4760.5100.5480.2400.1530.1610.0330.0210.0540.1170.1730.118
ViT-B-32 openai0.5590.7640.8150.8980.6430.4430.6640.8730.1350.5040.5370.6230.4470.2320.1640.0370.0240.0610.1270.1930.274
ViT-g-14 laion2b_s12b_b42k0.6960.8110.8510.9710.8390.6820.7760.9430.6030.6480.7180.5600.5800.3320.1750.0360.0310.0600.1150.1900.138
ViT-H-14 laion2b_s32b_b79k0.7090.7770.8500.9750.8470.6780.8010.9450.5630.7260.6990.5420.2970.2680.1690.0320.0270.0540.1110.1400.110
ViT-L-14 laion2b_s32b_b82k0.6770.8050.8510.9660.8330.6290.7580.9320.4590.6460.6680.5630.1160.3120.1610.0320.0200.0560.1080.2240.229
ViT-L-14 laion400m_e310.6540.7580.8390.9470.7740.5980.7570.9170.3780.6320.6710.4870.0580.2420.1490.0300.0260.0530.1090.1860.200
ViT-L-14 laion400m_e320.6540.7560.8390.9460.7740.6050.7560.9190.3800.6220.6750.4930.0610.2430.1490.0300.0260.0530.1100.1860.203
ViT-L-14 openai0.6980.7830.8350.9560.7580.5540.7920.9320.5710.6260.6330.5200.7330.1940.1610.0320.0230.0450.1150.1630.218
ViT-L-14-336 openai0.7090.7810.8370.9490.7440.5560.7830.9370.5600.6150.6380.6080.7330.2000.1580.0320.0240.0460.1130.1580.262
- ## Zero-shot classification | model_fullname | imagenetv2 | voc2007 | vtab/caltech101 | vtab/cifar10 | vtab/cifar100 | vtab/dtd | vtab/flowers | vtab/pets | vtab/svhn | vtab/eurosat | vtab/resisc45 | vtab/pcam | vtab/diabetic_retinopathy | vtab/clevr_count_all | vtab/clevr_closest_object_distance | vtab/dsprites_label_x_position | vtab/dsprites_label_orientation | vtab/smallnorb_label_azimuth | vtab/smallnorb_label_elevation | vtab/dmlab | vtab/kitti_closest_vehicle_distance | diff --git a/docs/user-guides/test.rst b/docs/user-guides/test.rst new file mode 100644 index 000000000..d45eb0868 --- /dev/null +++ b/docs/user-guides/test.rst @@ -0,0 +1,17 @@ +Test +==== + +Test +^^^^ + ++------------+------------+-----------+ +| Header 1 | Header 2 | Header 3 | ++============+============+===========+ +| body row 1 | column 2 | column 3 | ++------------+------------+-----------+ +| body row 2 | Cells may span columns.| ++------------+------------+-----------+ +| body row 3 | Cells may | - Cells | ++------------+ span rows. | - contain | +| body row 4 | | - blocks. 
| ++------------+------------+-----------+ \ No newline at end of file From 2ee049846111ddd9308140b4f196380b1fb1d168 Mon Sep 17 00:00:00 2001 From: ZiniuYu Date: Fri, 30 Sep 2022 18:35:05 +0800 Subject: [PATCH 11/34] chore: test rst --- docs/index.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/index.md b/docs/index.md index f243fc404..52d5dde1a 100644 --- a/docs/index.md +++ b/docs/index.md @@ -180,6 +180,7 @@ user-guides/server user-guides/retriever user-guides/benchmark user-guides/faq +user-guides/test ``` ```{toctree} From f01245b5f71ccae359f579628c36310689f9748f Mon Sep 17 00:00:00 2001 From: ZiniuYu Date: Fri, 30 Sep 2022 18:40:17 +0800 Subject: [PATCH 12/34] chore: test --- docs/user-guides/test.rst | 64 ++++++++++++++++++++++++++++++++------- 1 file changed, 53 insertions(+), 11 deletions(-) diff --git a/docs/user-guides/test.rst b/docs/user-guides/test.rst index d45eb0868..40ab72ef3 100644 --- a/docs/user-guides/test.rst +++ b/docs/user-guides/test.rst @@ -4,14 +4,56 @@ Test Test ^^^^ -+------------+------------+-----------+ -| Header 1 | Header 2 | Header 3 | -+============+============+===========+ -| body row 1 | column 2 | column 3 | -+------------+------------+-----------+ -| body row 2 | Cells may span columns.| -+------------+------------+-----------+ -| body row 3 | Cells may | - Cells | -+------------+ span rows. | - contain | -| body row 4 | | - blocks. 
| -+------------+------------+-----------+ \ No newline at end of file ++---------------------------------+------------+---------+-----------------+--------------+---------------+----------+--------------+-----------+-----------+--------------+---------------+-----------+---------------------------+----------------------+------------------------------------+--------------------------------+---------------------------------+------------------------------+--------------------------------+------------+-------------------------------------+ +| model_fullname | imagenetv2 | voc2007 | vtab/caltech101 | vtab/cifar10 | vtab/cifar100 | vtab/dtd | vtab/flowers | vtab/pets | vtab/svhn | vtab/eurosat | vtab/resisc45 | vtab/pcam | vtab/diabetic_retinopathy | vtab/clevr_count_all | vtab/clevr_closest_object_distance | vtab/dsprites_label_x_position | vtab/dsprites_label_orientation | vtab/smallnorb_label_azimuth | vtab/smallnorb_label_elevation | vtab/dmlab | vtab/kitti_closest_vehicle_distance | ++=================================+============+=========+=================+==============+===============+==========+==============+===========+===========+==============+===============+===========+===========================+======================+====================================+================================+=================================+==============================+================================+============+=====================================+ +| RN101 openai | 0.561 | 0.651 | 0.780 | 0.807 | 0.476 | 0.432 | 0.652 | 0.869 | 0.226 | 0.314 | 0.547 | 0.583 | 0.280 | 0.242 | 0.130 | 0.031 | 0.021 | 0.054 | 0.111 | 0.139 | 0.263 | 
++---------------------------------+------------+---------+-----------------+--------------+---------------+----------+--------------+-----------+-----------+--------------+---------------+-----------+---------------------------+----------------------+------------------------------------+--------------------------------+---------------------------------+------------------------------+--------------------------------+------------+-------------------------------------+ +| RN101 yfcc15m | 0.221 | 0.243 | 0.469 | 0.299 | 0.125 | 0.117 | 0.210 | 0.177 | 0.137 | 0.151 | 0.099 | 0.479 | 0.584 | 0.109 | 0.159 | 0.031 | 0.019 | 0.055 | 0.097 | 0.153 | 0.252 | ++---------------------------------+------------+---------+-----------------+--------------+---------------+----------+--------------+-----------+-----------+--------------+---------------+-----------+---------------------------+----------------------+------------------------------------+--------------------------------+---------------------------------+------------------------------+--------------------------------+------------+-------------------------------------+ +| RN50 cc12m | 0.224 | 0.438 | 0.582 | 0.395 | 0.178 | 0.135 | 0.095 | 0.331 | 0.102 | 0.148 | 0.117 | 0.535 | 0.293 | 0.184 | 0.222 | 0.031 | 0.025 | 0.047 | 0.096 | 0.161 | 0.155 | ++---------------------------------+------------+---------+-----------------+--------------+---------------+----------+--------------+-----------+-----------+--------------+---------------+-----------+---------------------------+----------------------+------------------------------------+--------------------------------+---------------------------------+------------------------------+--------------------------------+------------+-------------------------------------+ +| RN50 openai | 0.529 | 0.650 | 0.772 | 0.715 | 0.403 | 0.415 | 0.660 | 0.857 | 0.303 | 0.408 | 0.453 | 0.636 | 0.171 | 0.217 | 0.148 | 0.034 | 0.014 | 0.056 | 0.110 | 0.145 | 0.170 | 
++---------------------------------+------------+---------+-----------------+--------------+---------------+----------+--------------+-----------+-----------+--------------+---------------+-----------+---------------------------+----------------------+------------------------------------+--------------------------------+---------------------------------+------------------------------+--------------------------------+------------+-------------------------------------+ +| RN50 yfcc15m | 0.214 | 0.215 | 0.402 | 0.291 | 0.116 | 0.122 | 0.167 | 0.174 | 0.157 | 0.172 | 0.123 | 0.533 | 0.358 | 0.151 | 0.158 | 0.032 | 0.024 | 0.053 | 0.120 | 0.160 | 0.336 | ++---------------------------------+------------+---------+-----------------+--------------+---------------+----------+--------------+-----------+-----------+--------------+---------------+-----------+---------------------------+----------------------+------------------------------------+--------------------------------+---------------------------------+------------------------------+--------------------------------+------------+-------------------------------------+ +| RN50x16 openai | 0.643 | 0.680 | 0.810 | 0.813 | 0.522 | 0.524 | 0.724 | 0.898 | 0.409 | 0.433 | 0.589 | 0.625 | 0.715 | 0.195 | 0.213 | 0.030 | 0.026 | 0.050 | 0.116 | 0.146 | 0.229 | ++---------------------------------+------------+---------+-----------------+--------------+---------------+----------+--------------+-----------+-----------+--------------+---------------+-----------+---------------------------+----------------------+------------------------------------+--------------------------------+---------------------------------+------------------------------+--------------------------------+------------+-------------------------------------+ +| RN50x4 openai | 0.594 | 0.682 | 0.781 | 0.794 | 0.451 | 0.486 | 0.698 | 0.887 | 0.367 | 0.335 | 0.532 | 0.569 | 0.318 | 0.205 | 0.082 | 0.031 | 0.026 | 0.056 | 0.108 | 0.162 | 0.233 | 
++---------------------------------+------------+---------+-----------------+--------------+---------------+----------+--------------+-----------+-----------+--------------+---------------+-----------+---------------------------+----------------------+------------------------------------+--------------------------------+---------------------------------+------------------------------+--------------------------------+------------+-------------------------------------+ +| RN50x64 openai | 0.670 | 0.740 | 0.834 | 0.851 | 0.598 | 0.531 | 0.788 | 0.936 | 0.481 | 0.577 | 0.628 | 0.539 | 0.073 | 0.227 | 0.200 | 0.034 | 0.025 | 0.056 | 0.125 | 0.158 | 0.311 | ++---------------------------------+------------+---------+-----------------+--------------+---------------+----------+--------------+-----------+-----------+--------------+---------------+-----------+---------------------------+----------------------+------------------------------------+--------------------------------+---------------------------------+------------------------------+--------------------------------+------------+-------------------------------------+ +| ViT-B-16 laion400m_e31 | 0.594 | 0.767 | 0.838 | 0.917 | 0.712 | 0.513 | 0.694 | 0.892 | 0.380 | 0.503 | 0.585 | 0.593 | 0.062 | 0.289 | 0.245 | 0.031 | 0.030 | 0.059 | 0.100 | 0.152 | 0.200 | ++---------------------------------+------------+---------+-----------------+--------------+---------------+----------+--------------+-----------+-----------+--------------+---------------+-----------+---------------------------+----------------------+------------------------------------+--------------------------------+---------------------------------+------------------------------+--------------------------------+------------+-------------------------------------+ +| ViT-B-16 laion400m_e32 | 0.597 | 0.768 | 0.837 | 0.917 | 0.712 | 0.513 | 0.692 | 0.892 | 0.385 | 0.501 | 0.585 | 0.598 | 0.077 | 0.287 | 0.245 | 0.032 | 0.029 | 0.060 | 0.099 | 0.151 | 0.183 | 
++---------------------------------+------------+---------+-----------------+--------------+---------------+----------+--------------+-----------+-----------+--------------+---------------+-----------+---------------------------+----------------------+------------------------------------+--------------------------------+---------------------------------+------------------------------+--------------------------------+------------+-------------------------------------+ +| ViT-B-16 openai | 0.619 | 0.783 | 0.819 | 0.908 | 0.669 | 0.449 | 0.712 | 0.890 | 0.313 | 0.559 | 0.582 | 0.507 | 0.036 | 0.209 | 0.158 | 0.030 | 0.023 | 0.053 | 0.122 | 0.155 | 0.263 | ++---------------------------------+------------+---------+-----------------+--------------+---------------+----------+--------------+-----------+-----------+--------------+---------------+-----------+---------------------------+----------------------+------------------------------------+--------------------------------+---------------------------------+------------------------------+--------------------------------+------------+-------------------------------------+ +| ViT-B-16-plus-240 laion400m_e31 | 0.614 | 0.764 | 0.832 | 0.925 | 0.733 | 0.555 | 0.706 | 0.904 | 0.355 | 0.569 | 0.615 | 0.551 | 0.093 | 0.240 | 0.159 | 0.041 | 0.026 | 0.056 | 0.111 | 0.149 | 0.280 | ++---------------------------------+------------+---------+-----------------+--------------+---------------+----------+--------------+-----------+-----------+--------------+---------------+-----------+---------------------------+----------------------+------------------------------------+--------------------------------+---------------------------------+------------------------------+--------------------------------+------------+-------------------------------------+ +| ViT-B-16-plus-240 laion400m_e32 | 0.615 | 0.764 | 0.833 | 0.928 | 0.738 | 0.555 | 0.711 | 0.902 | 0.362 | 0.581 | 0.613 | 0.551 | 0.095 | 0.238 | 0.160 | 0.043 | 0.027 | 0.054 | 0.110 | 
0.148 | 0.281 | ++---------------------------------+------------+---------+-----------------+--------------+---------------+----------+--------------+-----------+-----------+--------------+---------------+-----------+---------------------------+----------------------+------------------------------------+--------------------------------+---------------------------------+------------------------------+--------------------------------+------------+-------------------------------------+ +| ViT-B-32 laion2b_e16 | 0.573 | 0.788 | 0.831 | 0.941 | 0.754 | 0.539 | 0.691 | 0.893 | 0.388 | 0.503 | 0.619 | 0.506 | 0.195 | 0.192 | 0.167 | 0.031 | 0.024 | 0.052 | 0.110 | 0.189 | 0.176 | ++---------------------------------+------------+---------+-----------------+--------------+---------------+----------+--------------+-----------+-----------+--------------+---------------+-----------+---------------------------+----------------------+------------------------------------+--------------------------------+---------------------------------+------------------------------+--------------------------------+------------+-------------------------------------+ +| ViT-B-32 laion2b_s34b_b79k | 0.581 | 0.791 | 0.839 | 0.936 | 0.755 | 0.557 | 0.716 | 0.909 | 0.410 | 0.482 | 0.610 | 0.598 | 0.734 | 0.153 | 0.189 | 0.029 | 0.034 | 0.062 | 0.113 | 0.159 | 0.262 | ++---------------------------------+------------+---------+-----------------+--------------+---------------+----------+--------------+-----------+-----------+--------------+---------------+-----------+---------------------------+----------------------+------------------------------------+--------------------------------+---------------------------------+------------------------------+--------------------------------+------------+-------------------------------------+ +| ViT-B-32 laion400m_e31 | 0.523 | 0.731 | 0.818 | 0.883 | 0.678 | 0.521 | 0.659 | 0.856 | 0.220 | 0.470 | 0.510 | 0.549 | 0.259 | 0.155 | 0.161 | 0.033 | 0.021 | 0.053 | 
0.117 | 0.173 | 0.122 | ++---------------------------------+------------+---------+-----------------+--------------+---------------+----------+--------------+-----------+-----------+--------------+---------------+-----------+---------------------------+----------------------+------------------------------------+--------------------------------+---------------------------------+------------------------------+--------------------------------+------------+-------------------------------------+ +| ViT-B-32 laion400m_e32 | 0.523 | 0.733 | 0.817 | 0.885 | 0.677 | 0.523 | 0.658 | 0.854 | 0.223 | 0.476 | 0.510 | 0.548 | 0.240 | 0.153 | 0.161 | 0.033 | 0.021 | 0.054 | 0.117 | 0.173 | 0.118 | ++---------------------------------+------------+---------+-----------------+--------------+---------------+----------+--------------+-----------+-----------+--------------+---------------+-----------+---------------------------+----------------------+------------------------------------+--------------------------------+---------------------------------+------------------------------+--------------------------------+------------+-------------------------------------+ +| ViT-B-32 openai | 0.559 | 0.764 | 0.815 | 0.898 | 0.643 | 0.443 | 0.664 | 0.873 | 0.135 | 0.504 | 0.537 | 0.623 | 0.447 | 0.232 | 0.164 | 0.037 | 0.024 | 0.061 | 0.127 | 0.193 | 0.274 | ++---------------------------------+------------+---------+-----------------+--------------+---------------+----------+--------------+-----------+-----------+--------------+---------------+-----------+---------------------------+----------------------+------------------------------------+--------------------------------+---------------------------------+------------------------------+--------------------------------+------------+-------------------------------------+ +| ViT-g-14 laion2b_s12b_b42k | 0.696 | 0.811 | 0.851 | 0.971 | 0.839 | 0.682 | 0.776 | 0.943 | 0.603 | 0.648 | 0.718 | 0.560 | 0.580 | 0.332 | 0.175 | 0.036 | 0.031 | 0.060 
| 0.115 | 0.190 | 0.138 | ++---------------------------------+------------+---------+-----------------+--------------+---------------+----------+--------------+-----------+-----------+--------------+---------------+-----------+---------------------------+----------------------+------------------------------------+--------------------------------+---------------------------------+------------------------------+--------------------------------+------------+-------------------------------------+ +| ViT-H-14 laion2b_s32b_b79k | 0.709 | 0.777 | 0.850 | 0.975 | 0.847 | 0.678 | 0.801 | 0.945 | 0.563 | 0.726 | 0.699 | 0.542 | 0.297 | 0.268 | 0.169 | 0.032 | 0.027 | 0.054 | 0.111 | 0.140 | 0.110 | ++---------------------------------+------------+---------+-----------------+--------------+---------------+----------+--------------+-----------+-----------+--------------+---------------+-----------+---------------------------+----------------------+------------------------------------+--------------------------------+---------------------------------+------------------------------+--------------------------------+------------+-------------------------------------+ +| ViT-L-14 laion2b_s32b_b82k | 0.677 | 0.805 | 0.851 | 0.966 | 0.833 | 0.629 | 0.758 | 0.932 | 0.459 | 0.646 | 0.668 | 0.563 | 0.116 | 0.312 | 0.161 | 0.032 | 0.020 | 0.056 | 0.108 | 0.224 | 0.229 | ++---------------------------------+------------+---------+-----------------+--------------+---------------+----------+--------------+-----------+-----------+--------------+---------------+-----------+---------------------------+----------------------+------------------------------------+--------------------------------+---------------------------------+------------------------------+--------------------------------+------------+-------------------------------------+ +| ViT-L-14 laion400m_e31 | 0.654 | 0.758 | 0.839 | 0.947 | 0.774 | 0.598 | 0.757 | 0.917 | 0.378 | 0.632 | 0.671 | 0.487 | 0.058 | 0.242 | 0.149 | 0.030 | 
0.026 | 0.053 | 0.109 | 0.186 | 0.200 | ++---------------------------------+------------+---------+-----------------+--------------+---------------+----------+--------------+-----------+-----------+--------------+---------------+-----------+---------------------------+----------------------+------------------------------------+--------------------------------+---------------------------------+------------------------------+--------------------------------+------------+-------------------------------------+ +| ViT-L-14 laion400m_e32 | 0.654 | 0.756 | 0.839 | 0.946 | 0.774 | 0.605 | 0.756 | 0.919 | 0.380 | 0.622 | 0.675 | 0.493 | 0.061 | 0.243 | 0.149 | 0.030 | 0.026 | 0.053 | 0.110 | 0.186 | 0.203 | ++---------------------------------+------------+---------+-----------------+--------------+---------------+----------+--------------+-----------+-----------+--------------+---------------+-----------+---------------------------+----------------------+------------------------------------+--------------------------------+---------------------------------+------------------------------+--------------------------------+------------+-------------------------------------+ +| ViT-L-14 openai | 0.698 | 0.783 | 0.835 | 0.956 | 0.758 | 0.554 | 0.792 | 0.932 | 0.571 | 0.626 | 0.633 | 0.520 | 0.733 | 0.194 | 0.161 | 0.032 | 0.023 | 0.045 | 0.115 | 0.163 | 0.218 | ++---------------------------------+------------+---------+-----------------+--------------+---------------+----------+--------------+-----------+-----------+--------------+---------------+-----------+---------------------------+----------------------+------------------------------------+--------------------------------+---------------------------------+------------------------------+--------------------------------+------------+-------------------------------------+ +| ViT-L-14-336 openai | 0.709 | 0.781 | 0.837 | 0.949 | 0.744 | 0.556 | 0.783 | 0.937 | 0.560 | 0.615 | 0.638 | 0.608 | 0.733 | 0.200 | 0.158 | 0.032 | 0.024 
| 0.046 | 0.113 | 0.158 | 0.262 | ++---------------------------------+------------+---------+-----------------+--------------+---------------+----------+--------------+-----------+-----------+--------------+---------------+-----------+---------------------------+----------------------+------------------------------------+--------------------------------+---------------------------------+------------------------------+--------------------------------+------------+-------------------------------------+ \ No newline at end of file From 4d8f5c14d9d967d02238dcf809d2a2595447aad5 Mon Sep 17 00:00:00 2001 From: ZiniuYu Date: Fri, 30 Sep 2022 18:55:27 +0800 Subject: [PATCH 13/34] fix: use rst in benchmark --- .../{benchmark.md => benchmark.rst} | 148 +++++++++++------- docs/user-guides/test.rst | 59 ------- 2 files changed, 88 insertions(+), 119 deletions(-) rename docs/user-guides/{benchmark.md => benchmark.rst} (57%) delete mode 100644 docs/user-guides/test.rst diff --git a/docs/user-guides/benchmark.md b/docs/user-guides/benchmark.rst similarity index 57% rename from docs/user-guides/benchmark.md rename to docs/user-guides/benchmark.rst index 30b9fecd1..965d5abf2 100644 --- a/docs/user-guides/benchmark.md +++ b/docs/user-guides/benchmark.rst @@ -7,69 +7,97 @@ In order to evaluate the performance of different CLIP models, we conducted a be We include the disk usage (in delta) and the peak RAM and VRAM usage (in delta) when running on a single Nvidia TITAN RTX GPU (24GB VRAM) for a series of text and image encoding tasks with `batch_size=8` using PyTorch runtime. 
-| Model | Disk Usage (MB) | Peak RAM Usage (GB) | Peak VRAM Usage (GB) | -|---------------------------------------|-----------------|---------------------|----------------------| -| RN50::openai | **244** | 2.99 | **1.36** | -| RN50::yfcc15m | 389 | 2.86 | **1.36** | -| RN50::cc12m | 389 | **2.84** | **1.36** | -| RN101::openai | 278 | 3.05 | 1.40 | -| RN101::yfcc15m | 457 | 2.88 | 1.40 | -| RN50x4::openai | 402 | 3.23 | 1.63 | -| RN50x16::openai | 631 | 3.63 | 2.02 | -| RN50x64::openai | 1291 | 4.08 | 2.98 | -| ViT-B-32::openai | 338 | 3.20 | 1.40 | -| ViT-B-32::laion400m_e31 | 577 | 2.93 | 1.40 | -| ViT-B-32::laion400m_e32 | 577 | 2.94 | 1.40 | -| ViT-B-32::laion2b_e16 | 577 | 2.93 | 1.40 | -| ViT-B-32::laion2b-s34b-b79k | 577 | 2.94 | 1.40 | -| ViT-B-16::openai | 335 | 3.20 | 1.44 | -| ViT-B-16::laion400m_e31 | 571 | 2.93 | 1.44 | -| ViT-B-16::laion400m_e32 | 571 | 2.94 | 1.44 | -| ViT-B-16-plus-240::laion400m_e31 | 795 | 3.03 | 1.59 | -| ViT-B-16-plus-240::laion400m_e32 | 795 | 3.03 | 1.59 | -| ViT-L-14::openai | 890 | 3.66 | 2.04 | -| ViT-L-14::laion400m_e31 | 1631 | 3.43 | 2.03 | -| ViT-L-14::laion400m_e32 | 1631 | 3.42 | 2.03 | -| ViT-L-14::laion2b-s32b-b82k | 1631 | 3.43 | 2.03 | -| ViT-L-14-336::openai | 891 | 3.74 | 2.23 | -| ViT-H-14::laion2b-s32b-b79k | 3762 | 4.45 | 3.26 | -| ViT-g-14::laion2b-s12b-b42k | 5214 | 5.16 | 4.00 | -| M-CLIP/LABSE-Vit-L-14 | 3609 | 4.30 | 4.70 | -| M-CLIP/XLM-Roberta-Large-Vit-B-32 | 4284 | 5.37 | 1.68 | -| M-CLIP/XLM-Roberta-Large-Vit-B-16Plus | 4293 | 4.30 | 4.13 | -| M-CLIP/XLM-Roberta-Large-Vit-L-14 | 4293 | 4.30 | 4.97 | - ++----------------------------------------+------------------+----------------------+-----------------------+ +| Model | Disk Usage (MB) | Peak RAM Usage (GB) | Peak VRAM Usage (GB) | ++========================================+==================+======================+=======================+ +| RN50::openai | 244 | 2.99 | 1.36 | +| RN50::yfcc15m | 389 | 2.86 | 1.36 | +| RN50::cc12m | 389 | 2.84 | 
1.36 | +| RN101::openai | 278 | 3.05 | 1.40 | +| RN101::yfcc15m | 457 | 2.88 | 1.40 | +| RN50x4::openai | 402 | 3.23 | 1.63 | +| RN50x16::openai | 631 | 3.63 | 2.02 | +| RN50x64::openai | 1291 | 4.08 | 2.98 | +| ViT-B-32::openai | 338 | 3.20 | 1.40 | +| ViT-B-32::laion2b_e16 | 577 | 2.93 | 1.40 | +| ViT-B-32::laion400m_e31 | 577 | 2.93 | 1.40 | +| ViT-B-32::laion400m_e32 | 577 | 2.94 | 1.40 | +| ViT-B-32::laion2b-s34b-b79k | 577 | 2.94 | 1.40 | +| ViT-B-16::openai | 335 | 3.20 | 1.44 | +| ViT-B-16::laion400m_e31 | 571 | 2.93 | 1.44 | +| ViT-B-16::laion400m_e32 | 571 | 2.94 | 1.44 | +| ViT-B-16-plus-240::laion400m_e31 | 795 | 3.03 | 1.59 | +| ViT-B-16-plus-240::laion400m_e32 | 795 | 3.03 | 1.59 | +| ViT-L-14::openai | 890 | 3.66 | 2.04 | +| ViT-L-14::laion400m_e31 | 1631 | 3.43 | 2.03 | +| ViT-L-14::laion400m_e32 | 1631 | 3.42 | 2.03 | +| ViT-L-14::laion2b-s32b-b82k | 1631 | 3.43 | 2.03 | +| ViT-L-14-336::openai | 891 | 3.74 | 2.23 | +| ViT-H-14::laion2b-s32b-b79k | 3762 | 4.45 | 3.26 | +| ViT-g-14::laion2b-s12b-b42k | 5214 | 5.16 | 4.00 | +| M-CLIP/LABSE-Vit-L-14 | 3609 | 4.30 | 4.70 | +| M-CLIP/XLM-Roberta-Large-Vit-B-32 | 4284 | 5.37 | 1.68 | +| M-CLIP/XLM-Roberta-Large-Vit-B-16Plus | 4293 | 4.30 | 4.13 | +| M-CLIP/XLM-Roberta-Large-Vit-L-14 | 4293 | 4.30 | 4.97 | ++----------------------------------------+------------------+----------------------+-----------------------+ ## Zero-shot retrieval -| Model | COCO Caption | | Flickr 8k | | Flickr 30k | | -|----------------------------------|--------------|-------|-----------|-------|------------|-------| -| | Image | Text | Image | Text | Image | Text | -| RN101::openai | 0.555 | 0.745 | 0.523 | 0.694 | 0.415 | 0.629 | -| RN101::yfcc15m | 0.376 | 0.549 | 0.251 | 0.417 | 0.156 | 0.296 | -| RN50::cc12m | 0.446 | 0.607 | 0.302 | 0.435 | 0.204 | 0.316 | -| RN50::openai | 0.529 | 0.728 | 0.504 | 0.690 | 0.392 | 0.621 | -| RN50::yfcc15m | 0.361 | 0.534 | 0.238 | 0.394 | 0.146 | 0.278 | -| RN50x16::openai | 0.600 | 0.787 | 
0.597 | 0.768 | 0.496 | 0.713 | -| RN50x4::openai | 0.581 | 0.767 | 0.558 | 0.729 | 0.451 | 0.671 | -| RN50x64::openai | 0.599 | 0.803 | 0.629 | 0.790 | 0.534 | 0.756 | -| ViT-B-16::laion400m_e31 | 0.637 | 0.796 | 0.620 | 0.765 | 0.506 | 0.697 | -| ViT-B-16::laion400m_e32 | 0.636 | 0.796 | 0.620 | 0.767 | 0.508 | 0.697 | -| ViT-B-16::openai | 0.584 | 0.767 | 0.564 | 0.727 | 0.452 | 0.671 | -| ViT-B-16-plus-240::laion400m_e31 | 0.660 | 0.809 | 0.642 | 0.788 | 0.533 | 0.725 | -| ViT-B-16-plus-240::laion400m_e32 | 0.662 | 0.811 | 0.644 | 0.791 | 0.535 | 0.727 | -| ViT-B-32::laion2b_e16 | 0.647 | 0.795 | 0.622 | 0.760 | 0.507 | 0.687 | -| ViT-B-32::laion2b_s34b_b79k | 0.654 | 0.798 | 0.629 | 0.778 | 0.513 | 0.694 | -| ViT-B-32::laion400m_e31 | 0.600 | 0.763 | 0.562 | 0.736 | 0.438 | 0.633 | -| ViT-B-32::laion400m_e32 | 0.600 | 0.765 | 0.562 | 0.736 | 0.437 | 0.634 | -| ViT-B-32::openai | 0.560 | 0.749 | 0.532 | 0.699 | 0.413 | 0.629 | -| ViT-g-14::laion2b_s12b_b42k | 0.724 | 0.853 | 0.730 | 0.846 | 0.639 | 0.806 | -| ViT-H-14::laion2b_s32b_b79k | 0.734 | 0.861 | 0.746 | 0.856 | 0.657 | 0.823 | -| ViT-L-14::laion2b_s32b_b82k | 0.711 | 0.840 | 0.712 | 0.824 | 0.620 | 0.789 | -| ViT-L-14::laion400m_e31 | 0.680 | 0.821 | 0.675 | 0.806 | 0.570 | 0.751 | -| ViT-L-14::laion400m_e32 | 0.680 | 0.821 | 0.675 | 0.806 | 0.570 | 0.751 | -| ViT-L-14::openai | 0.610 | 0.793 | 0.599 | 0.767 | 0.494 | 0.717 | -| ViT-L-14-336::openai | 0.616 | 0.812 | 0.629 | 0.779 | 0.533 | 0.741 | ++----------------------------------+-------------------------+-------------------------+-------------------------+ +| Model | COCO Caption | Flickr 8k | Flickr 30k | +| +-------+-------+---------+-------+-------+---------+-------+-------+---------+ +| | Image | Text | Average | Image | Text | Average | Image | Text | Average | ++==================================+=======+=======+=========+=======+=======+=========+=======+=======+=========+ +| RN101::openai | 0.555 | 0.745 | 0.650 | 0.523 | 0.694 | 0.608 | 
0.415 | 0.629 | 0.522 | ++----------------------------------+-------+-------+---------+-------+-------+---------+-------+-------+---------+ +| RN101::yfcc15m | 0.376 | 0.549 | 0.463 | 0.251 | 0.417 | 0.334 | 0.156 | 0.296 | 0.226 | ++----------------------------------+-------+-------+---------+-------+-------+---------+-------+-------+---------+ +| RN50::cc12m | 0.446 | 0.607 | 0.527 | 0.302 | 0.435 | 0.369 | 0.204 | 0.316 | 0.260 | ++----------------------------------+-------+-------+---------+-------+-------+---------+-------+-------+---------+ +| RN50::openai | 0.529 | 0.728 | 0.629 | 0.504 | 0.690 | 0.597 | 0.392 | 0.621 | 0.506 | ++----------------------------------+-------+-------+---------+-------+-------+---------+-------+-------+---------+ +| RN50::yfcc15m | 0.361 | 0.534 | 0.447 | 0.238 | 0.394 | 0.316 | 0.146 | 0.278 | 0.212 | ++----------------------------------+-------+-------+---------+-------+-------+---------+-------+-------+---------+ +| RN50x16::openai | 0.600 | 0.787 | 0.693 | 0.597 | 0.768 | 0.682 | 0.496 | 0.713 | 0.604 | ++----------------------------------+-------+-------+---------+-------+-------+---------+-------+-------+---------+ +| RN50x4::openai | 0.581 | 0.767 | 0.674 | 0.558 | 0.729 | 0.643 | 0.451 | 0.671 | 0.561 | ++----------------------------------+-------+-------+---------+-------+-------+---------+-------+-------+---------+ +| RN50x64::openai | 0.599 | 0.803 | 0.701 | 0.629 | 0.790 | 0.709 | 0.534 | 0.756 | 0.645 | ++----------------------------------+-------+-------+---------+-------+-------+---------+-------+-------+---------+ +| ViT-B-16::laion400m_e31 | 0.637 | 0.796 | 0.717 | 0.620 | 0.765 | 0.692 | 0.506 | 0.697 | 0.602 | ++----------------------------------+-------+-------+---------+-------+-------+---------+-------+-------+---------+ +| ViT-B-16::laion400m_e32 | 0.636 | 0.796 | 0.716 | 0.620 | 0.767 | 0.694 | 0.508 | 0.697 | 0.603 | 
++----------------------------------+-------+-------+---------+-------+-------+---------+-------+-------+---------+ +| ViT-B-16::openai | 0.584 | 0.767 | 0.676 | 0.564 | 0.727 | 0.646 | 0.452 | 0.671 | 0.561 | ++----------------------------------+-------+-------+---------+-------+-------+---------+-------+-------+---------+ +| ViT-B-16-plus-240::laion400m_e31 | 0.660 | 0.809 | 0.735 | 0.642 | 0.788 | 0.715 | 0.533 | 0.725 | 0.629 | ++----------------------------------+-------+-------+---------+-------+-------+---------+-------+-------+---------+ +| ViT-B-16-plus-240::laion400m_e32 | 0.662 | 0.811 | 0.736 | 0.644 | 0.791 | 0.718 | 0.535 | 0.727 | 0.631 | ++----------------------------------+-------+-------+---------+-------+-------+---------+-------+-------+---------+ +| ViT-B-32::laion2b_e16 | 0.647 | 0.795 | 0.721 | 0.622 | 0.760 | 0.691 | 0.507 | 0.687 | 0.597 | ++----------------------------------+-------+-------+---------+-------+-------+---------+-------+-------+---------+ +| ViT-B-32::laion2b_s34b_b79k | 0.654 | 0.798 | 0.726 | 0.629 | 0.778 | 0.703 | 0.513 | 0.694 | 0.603 | ++----------------------------------+-------+-------+---------+-------+-------+---------+-------+-------+---------+ +| ViT-B-32::laion400m_e31 | 0.600 | 0.763 | 0.682 | 0.562 | 0.736 | 0.649 | 0.438 | 0.633 | 0.536 | ++----------------------------------+-------+-------+---------+-------+-------+---------+-------+-------+---------+ +| ViT-B-32::laion400m_e32 | 0.600 | 0.765 | 0.682 | 0.562 | 0.736 | 0.649 | 0.437 | 0.634 | 0.536 | ++----------------------------------+-------+-------+---------+-------+-------+---------+-------+-------+---------+ +| ViT-B-32::openai | 0.560 | 0.749 | 0.654 | 0.532 | 0.699 | 0.616 | 0.413 | 0.629 | 0.521 | ++----------------------------------+-------+-------+---------+-------+-------+---------+-------+-------+---------+ +| ViT-g-14::laion2b_s12b_b42k | 0.724 | 0.853 | 0.788 | 0.730 | 0.846 | 0.788 | 0.639 | 0.806 | 0.722 | 
++----------------------------------+-------+-------+---------+-------+-------+---------+-------+-------+---------+ +| ViT-H-14::laion2b_s32b_b79k | 0.734 | 0.861 | 0.797 | 0.746 | 0.856 | 0.801 | 0.657 | 0.823 | 0.740 | ++----------------------------------+-------+-------+---------+-------+-------+---------+-------+-------+---------+ +| ViT-L-14::laion2b_s32b_b82k | 0.711 | 0.840 | 0.775 | 0.712 | 0.824 | 0.768 | 0.620 | 0.789 | 0.704 | ++----------------------------------+-------+-------+---------+-------+-------+---------+-------+-------+---------+ +| ViT-L-14::laion400m_e31 | 0.680 | 0.821 | 0.750 | 0.675 | 0.806 | 0.741 | 0.570 | 0.751 | 0.661 | ++----------------------------------+-------+-------+---------+-------+-------+---------+-------+-------+---------+ +| ViT-L-14::laion400m_e32 | 0.680 | 0.821 | 0.751 | 0.675 | 0.806 | 0.740 | 0.570 | 0.751 | 0.661 | ++----------------------------------+-------+-------+---------+-------+-------+---------+-------+-------+---------+ +| ViT-L-14::openai | 0.610 | 0.793 | 0.702 | 0.599 | 0.767 | 0.683 | 0.494 | 0.717 | 0.605 | ++----------------------------------+-------+-------+---------+-------+-------+---------+-------+-------+---------+ +| ViT-L-14-336::openai | 0.616 | 0.812 | 0.714 | 0.629 | 0.779 | 0.704 | 0.533 | 0.741 | 0.637 | ++----------------------------------+-------+-------+---------+-------+-------+---------+-------+-------+---------+ ## Zero-shot classification diff --git a/docs/user-guides/test.rst b/docs/user-guides/test.rst deleted file mode 100644 index 40ab72ef3..000000000 --- a/docs/user-guides/test.rst +++ /dev/null @@ -1,59 +0,0 @@ -Test -==== - -Test -^^^^ - 
-+---------------------------------+------------+---------+-----------------+--------------+---------------+----------+--------------+-----------+-----------+--------------+---------------+-----------+---------------------------+----------------------+------------------------------------+--------------------------------+---------------------------------+------------------------------+--------------------------------+------------+-------------------------------------+ -| model_fullname | imagenetv2 | voc2007 | vtab/caltech101 | vtab/cifar10 | vtab/cifar100 | vtab/dtd | vtab/flowers | vtab/pets | vtab/svhn | vtab/eurosat | vtab/resisc45 | vtab/pcam | vtab/diabetic_retinopathy | vtab/clevr_count_all | vtab/clevr_closest_object_distance | vtab/dsprites_label_x_position | vtab/dsprites_label_orientation | vtab/smallnorb_label_azimuth | vtab/smallnorb_label_elevation | vtab/dmlab | vtab/kitti_closest_vehicle_distance | -+=================================+============+=========+=================+==============+===============+==========+==============+===========+===========+==============+===============+===========+===========================+======================+====================================+================================+=================================+==============================+================================+============+=====================================+ -| RN101 openai | 0.561 | 0.651 | 0.780 | 0.807 | 0.476 | 0.432 | 0.652 | 0.869 | 0.226 | 0.314 | 0.547 | 0.583 | 0.280 | 0.242 | 0.130 | 0.031 | 0.021 | 0.054 | 0.111 | 0.139 | 0.263 | 
-+---------------------------------+------------+---------+-----------------+--------------+---------------+----------+--------------+-----------+-----------+--------------+---------------+-----------+---------------------------+----------------------+------------------------------------+--------------------------------+---------------------------------+------------------------------+--------------------------------+------------+-------------------------------------+ -| RN101 yfcc15m | 0.221 | 0.243 | 0.469 | 0.299 | 0.125 | 0.117 | 0.210 | 0.177 | 0.137 | 0.151 | 0.099 | 0.479 | 0.584 | 0.109 | 0.159 | 0.031 | 0.019 | 0.055 | 0.097 | 0.153 | 0.252 | -+---------------------------------+------------+---------+-----------------+--------------+---------------+----------+--------------+-----------+-----------+--------------+---------------+-----------+---------------------------+----------------------+------------------------------------+--------------------------------+---------------------------------+------------------------------+--------------------------------+------------+-------------------------------------+ -| RN50 cc12m | 0.224 | 0.438 | 0.582 | 0.395 | 0.178 | 0.135 | 0.095 | 0.331 | 0.102 | 0.148 | 0.117 | 0.535 | 0.293 | 0.184 | 0.222 | 0.031 | 0.025 | 0.047 | 0.096 | 0.161 | 0.155 | -+---------------------------------+------------+---------+-----------------+--------------+---------------+----------+--------------+-----------+-----------+--------------+---------------+-----------+---------------------------+----------------------+------------------------------------+--------------------------------+---------------------------------+------------------------------+--------------------------------+------------+-------------------------------------+ -| RN50 openai | 0.529 | 0.650 | 0.772 | 0.715 | 0.403 | 0.415 | 0.660 | 0.857 | 0.303 | 0.408 | 0.453 | 0.636 | 0.171 | 0.217 | 0.148 | 0.034 | 0.014 | 0.056 | 0.110 | 0.145 | 0.170 | 
-+---------------------------------+------------+---------+-----------------+--------------+---------------+----------+--------------+-----------+-----------+--------------+---------------+-----------+---------------------------+----------------------+------------------------------------+--------------------------------+---------------------------------+------------------------------+--------------------------------+------------+-------------------------------------+ -| RN50 yfcc15m | 0.214 | 0.215 | 0.402 | 0.291 | 0.116 | 0.122 | 0.167 | 0.174 | 0.157 | 0.172 | 0.123 | 0.533 | 0.358 | 0.151 | 0.158 | 0.032 | 0.024 | 0.053 | 0.120 | 0.160 | 0.336 | -+---------------------------------+------------+---------+-----------------+--------------+---------------+----------+--------------+-----------+-----------+--------------+---------------+-----------+---------------------------+----------------------+------------------------------------+--------------------------------+---------------------------------+------------------------------+--------------------------------+------------+-------------------------------------+ -| RN50x16 openai | 0.643 | 0.680 | 0.810 | 0.813 | 0.522 | 0.524 | 0.724 | 0.898 | 0.409 | 0.433 | 0.589 | 0.625 | 0.715 | 0.195 | 0.213 | 0.030 | 0.026 | 0.050 | 0.116 | 0.146 | 0.229 | -+---------------------------------+------------+---------+-----------------+--------------+---------------+----------+--------------+-----------+-----------+--------------+---------------+-----------+---------------------------+----------------------+------------------------------------+--------------------------------+---------------------------------+------------------------------+--------------------------------+------------+-------------------------------------+ -| RN50x4 openai | 0.594 | 0.682 | 0.781 | 0.794 | 0.451 | 0.486 | 0.698 | 0.887 | 0.367 | 0.335 | 0.532 | 0.569 | 0.318 | 0.205 | 0.082 | 0.031 | 0.026 | 0.056 | 0.108 | 0.162 | 0.233 | 
-+---------------------------------+------------+---------+-----------------+--------------+---------------+----------+--------------+-----------+-----------+--------------+---------------+-----------+---------------------------+----------------------+------------------------------------+--------------------------------+---------------------------------+------------------------------+--------------------------------+------------+-------------------------------------+ -| RN50x64 openai | 0.670 | 0.740 | 0.834 | 0.851 | 0.598 | 0.531 | 0.788 | 0.936 | 0.481 | 0.577 | 0.628 | 0.539 | 0.073 | 0.227 | 0.200 | 0.034 | 0.025 | 0.056 | 0.125 | 0.158 | 0.311 | -+---------------------------------+------------+---------+-----------------+--------------+---------------+----------+--------------+-----------+-----------+--------------+---------------+-----------+---------------------------+----------------------+------------------------------------+--------------------------------+---------------------------------+------------------------------+--------------------------------+------------+-------------------------------------+ -| ViT-B-16 laion400m_e31 | 0.594 | 0.767 | 0.838 | 0.917 | 0.712 | 0.513 | 0.694 | 0.892 | 0.380 | 0.503 | 0.585 | 0.593 | 0.062 | 0.289 | 0.245 | 0.031 | 0.030 | 0.059 | 0.100 | 0.152 | 0.200 | -+---------------------------------+------------+---------+-----------------+--------------+---------------+----------+--------------+-----------+-----------+--------------+---------------+-----------+---------------------------+----------------------+------------------------------------+--------------------------------+---------------------------------+------------------------------+--------------------------------+------------+-------------------------------------+ -| ViT-B-16 laion400m_e32 | 0.597 | 0.768 | 0.837 | 0.917 | 0.712 | 0.513 | 0.692 | 0.892 | 0.385 | 0.501 | 0.585 | 0.598 | 0.077 | 0.287 | 0.245 | 0.032 | 0.029 | 0.060 | 0.099 | 0.151 | 0.183 | 
-+---------------------------------+------------+---------+-----------------+--------------+---------------+----------+--------------+-----------+-----------+--------------+---------------+-----------+---------------------------+----------------------+------------------------------------+--------------------------------+---------------------------------+------------------------------+--------------------------------+------------+-------------------------------------+ -| ViT-B-16 openai | 0.619 | 0.783 | 0.819 | 0.908 | 0.669 | 0.449 | 0.712 | 0.890 | 0.313 | 0.559 | 0.582 | 0.507 | 0.036 | 0.209 | 0.158 | 0.030 | 0.023 | 0.053 | 0.122 | 0.155 | 0.263 | -+---------------------------------+------------+---------+-----------------+--------------+---------------+----------+--------------+-----------+-----------+--------------+---------------+-----------+---------------------------+----------------------+------------------------------------+--------------------------------+---------------------------------+------------------------------+--------------------------------+------------+-------------------------------------+ -| ViT-B-16-plus-240 laion400m_e31 | 0.614 | 0.764 | 0.832 | 0.925 | 0.733 | 0.555 | 0.706 | 0.904 | 0.355 | 0.569 | 0.615 | 0.551 | 0.093 | 0.240 | 0.159 | 0.041 | 0.026 | 0.056 | 0.111 | 0.149 | 0.280 | -+---------------------------------+------------+---------+-----------------+--------------+---------------+----------+--------------+-----------+-----------+--------------+---------------+-----------+---------------------------+----------------------+------------------------------------+--------------------------------+---------------------------------+------------------------------+--------------------------------+------------+-------------------------------------+ -| ViT-B-16-plus-240 laion400m_e32 | 0.615 | 0.764 | 0.833 | 0.928 | 0.738 | 0.555 | 0.711 | 0.902 | 0.362 | 0.581 | 0.613 | 0.551 | 0.095 | 0.238 | 0.160 | 0.043 | 0.027 | 0.054 | 0.110 | 
0.148 | 0.281 | -+---------------------------------+------------+---------+-----------------+--------------+---------------+----------+--------------+-----------+-----------+--------------+---------------+-----------+---------------------------+----------------------+------------------------------------+--------------------------------+---------------------------------+------------------------------+--------------------------------+------------+-------------------------------------+ -| ViT-B-32 laion2b_e16 | 0.573 | 0.788 | 0.831 | 0.941 | 0.754 | 0.539 | 0.691 | 0.893 | 0.388 | 0.503 | 0.619 | 0.506 | 0.195 | 0.192 | 0.167 | 0.031 | 0.024 | 0.052 | 0.110 | 0.189 | 0.176 | -+---------------------------------+------------+---------+-----------------+--------------+---------------+----------+--------------+-----------+-----------+--------------+---------------+-----------+---------------------------+----------------------+------------------------------------+--------------------------------+---------------------------------+------------------------------+--------------------------------+------------+-------------------------------------+ -| ViT-B-32 laion2b_s34b_b79k | 0.581 | 0.791 | 0.839 | 0.936 | 0.755 | 0.557 | 0.716 | 0.909 | 0.410 | 0.482 | 0.610 | 0.598 | 0.734 | 0.153 | 0.189 | 0.029 | 0.034 | 0.062 | 0.113 | 0.159 | 0.262 | -+---------------------------------+------------+---------+-----------------+--------------+---------------+----------+--------------+-----------+-----------+--------------+---------------+-----------+---------------------------+----------------------+------------------------------------+--------------------------------+---------------------------------+------------------------------+--------------------------------+------------+-------------------------------------+ -| ViT-B-32 laion400m_e31 | 0.523 | 0.731 | 0.818 | 0.883 | 0.678 | 0.521 | 0.659 | 0.856 | 0.220 | 0.470 | 0.510 | 0.549 | 0.259 | 0.155 | 0.161 | 0.033 | 0.021 | 0.053 | 
0.117 | 0.173 | 0.122 | -+---------------------------------+------------+---------+-----------------+--------------+---------------+----------+--------------+-----------+-----------+--------------+---------------+-----------+---------------------------+----------------------+------------------------------------+--------------------------------+---------------------------------+------------------------------+--------------------------------+------------+-------------------------------------+ -| ViT-B-32 laion400m_e32 | 0.523 | 0.733 | 0.817 | 0.885 | 0.677 | 0.523 | 0.658 | 0.854 | 0.223 | 0.476 | 0.510 | 0.548 | 0.240 | 0.153 | 0.161 | 0.033 | 0.021 | 0.054 | 0.117 | 0.173 | 0.118 | -+---------------------------------+------------+---------+-----------------+--------------+---------------+----------+--------------+-----------+-----------+--------------+---------------+-----------+---------------------------+----------------------+------------------------------------+--------------------------------+---------------------------------+------------------------------+--------------------------------+------------+-------------------------------------+ -| ViT-B-32 openai | 0.559 | 0.764 | 0.815 | 0.898 | 0.643 | 0.443 | 0.664 | 0.873 | 0.135 | 0.504 | 0.537 | 0.623 | 0.447 | 0.232 | 0.164 | 0.037 | 0.024 | 0.061 | 0.127 | 0.193 | 0.274 | -+---------------------------------+------------+---------+-----------------+--------------+---------------+----------+--------------+-----------+-----------+--------------+---------------+-----------+---------------------------+----------------------+------------------------------------+--------------------------------+---------------------------------+------------------------------+--------------------------------+------------+-------------------------------------+ -| ViT-g-14 laion2b_s12b_b42k | 0.696 | 0.811 | 0.851 | 0.971 | 0.839 | 0.682 | 0.776 | 0.943 | 0.603 | 0.648 | 0.718 | 0.560 | 0.580 | 0.332 | 0.175 | 0.036 | 0.031 | 0.060 
| 0.115 | 0.190 | 0.138 | -+---------------------------------+------------+---------+-----------------+--------------+---------------+----------+--------------+-----------+-----------+--------------+---------------+-----------+---------------------------+----------------------+------------------------------------+--------------------------------+---------------------------------+------------------------------+--------------------------------+------------+-------------------------------------+ -| ViT-H-14 laion2b_s32b_b79k | 0.709 | 0.777 | 0.850 | 0.975 | 0.847 | 0.678 | 0.801 | 0.945 | 0.563 | 0.726 | 0.699 | 0.542 | 0.297 | 0.268 | 0.169 | 0.032 | 0.027 | 0.054 | 0.111 | 0.140 | 0.110 | -+---------------------------------+------------+---------+-----------------+--------------+---------------+----------+--------------+-----------+-----------+--------------+---------------+-----------+---------------------------+----------------------+------------------------------------+--------------------------------+---------------------------------+------------------------------+--------------------------------+------------+-------------------------------------+ -| ViT-L-14 laion2b_s32b_b82k | 0.677 | 0.805 | 0.851 | 0.966 | 0.833 | 0.629 | 0.758 | 0.932 | 0.459 | 0.646 | 0.668 | 0.563 | 0.116 | 0.312 | 0.161 | 0.032 | 0.020 | 0.056 | 0.108 | 0.224 | 0.229 | -+---------------------------------+------------+---------+-----------------+--------------+---------------+----------+--------------+-----------+-----------+--------------+---------------+-----------+---------------------------+----------------------+------------------------------------+--------------------------------+---------------------------------+------------------------------+--------------------------------+------------+-------------------------------------+ -| ViT-L-14 laion400m_e31 | 0.654 | 0.758 | 0.839 | 0.947 | 0.774 | 0.598 | 0.757 | 0.917 | 0.378 | 0.632 | 0.671 | 0.487 | 0.058 | 0.242 | 0.149 | 0.030 | 
0.026 | 0.053 | 0.109 | 0.186 | 0.200 | -+---------------------------------+------------+---------+-----------------+--------------+---------------+----------+--------------+-----------+-----------+--------------+---------------+-----------+---------------------------+----------------------+------------------------------------+--------------------------------+---------------------------------+------------------------------+--------------------------------+------------+-------------------------------------+ -| ViT-L-14 laion400m_e32 | 0.654 | 0.756 | 0.839 | 0.946 | 0.774 | 0.605 | 0.756 | 0.919 | 0.380 | 0.622 | 0.675 | 0.493 | 0.061 | 0.243 | 0.149 | 0.030 | 0.026 | 0.053 | 0.110 | 0.186 | 0.203 | -+---------------------------------+------------+---------+-----------------+--------------+---------------+----------+--------------+-----------+-----------+--------------+---------------+-----------+---------------------------+----------------------+------------------------------------+--------------------------------+---------------------------------+------------------------------+--------------------------------+------------+-------------------------------------+ -| ViT-L-14 openai | 0.698 | 0.783 | 0.835 | 0.956 | 0.758 | 0.554 | 0.792 | 0.932 | 0.571 | 0.626 | 0.633 | 0.520 | 0.733 | 0.194 | 0.161 | 0.032 | 0.023 | 0.045 | 0.115 | 0.163 | 0.218 | -+---------------------------------+------------+---------+-----------------+--------------+---------------+----------+--------------+-----------+-----------+--------------+---------------+-----------+---------------------------+----------------------+------------------------------------+--------------------------------+---------------------------------+------------------------------+--------------------------------+------------+-------------------------------------+ -| ViT-L-14-336 openai | 0.709 | 0.781 | 0.837 | 0.949 | 0.744 | 0.556 | 0.783 | 0.937 | 0.560 | 0.615 | 0.638 | 0.608 | 0.733 | 0.200 | 0.158 | 0.032 | 0.024 
| 0.046 | 0.113 | 0.158 | 0.262 | -+---------------------------------+------------+---------+-----------------+--------------+---------------+----------+--------------+-----------+-----------+--------------+---------------+-----------+---------------------------+----------------------+------------------------------------+--------------------------------+---------------------------------+------------------------------+--------------------------------+------------+-------------------------------------+ \ No newline at end of file From d395f691f8210a6823b07e7cc992cffb51ed4e4d Mon Sep 17 00:00:00 2001 From: ZiniuYu Date: Fri, 30 Sep 2022 18:58:13 +0800 Subject: [PATCH 14/34] fix: typo --- docs/index.md | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/index.md b/docs/index.md index 52d5dde1a..f243fc404 100644 --- a/docs/index.md +++ b/docs/index.md @@ -180,7 +180,6 @@ user-guides/server user-guides/retriever user-guides/benchmark user-guides/faq -user-guides/test ``` ```{toctree} From c2591ee55647e9bd0c3b7a9ba16e047bd4c89237 Mon Sep 17 00:00:00 2001 From: ZiniuYu Date: Fri, 30 Sep 2022 19:03:20 +0800 Subject: [PATCH 15/34] fix: rst --- docs/user-guides/benchmark.rst | 42 +++++++++++++++++++++++++++++----- 1 file changed, 36 insertions(+), 6 deletions(-) diff --git a/docs/user-guides/benchmark.rst b/docs/user-guides/benchmark.rst index 965d5abf2..eab9ed5a6 100644 --- a/docs/user-guides/benchmark.rst +++ b/docs/user-guides/benchmark.rst @@ -1,9 +1,11 @@ -# CLIP Benchmark +CLIP Benchmark +============== In order to evaluate the performance of different CLIP models, we conducted a benchmark on a series of tasks using different datasets. You can find the benchmark results in the following tables. The best results are highlighted in bold. They can be used as a guide to choose the best model for your application. 
-## Basic statistics +Basic statistics +^^^^^^^^^^^^^^^^ We include the disk usage (in delta) and the peak RAM and VRAM usage (in delta) when running on a single Nvidia TITAN RTX GPU (24GB VRAM) for a series of text and image encoding tasks with `batch_size=8` using PyTorch runtime. @@ -41,7 +43,8 @@ We include the disk usage (in delta) and the peak RAM and VRAM usage (in delta) | M-CLIP/XLM-Roberta-Large-Vit-L-14 | 4293 | 4.30 | 4.97 | +----------------------------------------+------------------+----------------------+-----------------------+ -## Zero-shot retrieval +Zero-shot retrieval +^^^^^^^^^^^^^^^^^^^ +----------------------------------+-------------------------+-------------------------+-------------------------+ | Model | COCO Caption | Flickr 8k | Flickr 30k | @@ -100,32 +103,59 @@ We include the disk usage (in delta) and the peak RAM and VRAM usage (in delta) +----------------------------------+-------+-------+---------+-------+-------+---------+-------+-------+---------+ -## Zero-shot classification +Zero-shot classification +^^^^^^^^^^^^^^^^^^^^^^^^ ++---------------------------------+------------+---------+-----------------+--------------+---------------+----------+--------------+-----------+-----------+--------------+---------------+-----------+---------------------------+----------------------+------------------------------------+--------------------------------+---------------------------------+------------------------------+--------------------------------+------------+-------------------------------------+ | model_fullname | imagenetv2 | voc2007 | vtab/caltech101 | vtab/cifar10 | vtab/cifar100 | vtab/dtd | vtab/flowers | vtab/pets | vtab/svhn | vtab/eurosat | vtab/resisc45 | vtab/pcam | vtab/diabetic_retinopathy | vtab/clevr_count_all | vtab/clevr_closest_object_distance | vtab/dsprites_label_x_position | vtab/dsprites_label_orientation | vtab/smallnorb_label_azimuth | vtab/smallnorb_label_elevation | vtab/dmlab | 
vtab/kitti_closest_vehicle_distance | -|---------------------------------|------------|---------|-----------------|--------------|---------------|----------|--------------|-----------|-----------|--------------|---------------|-----------|---------------------------|----------------------|------------------------------------|--------------------------------|---------------------------------|------------------------------|--------------------------------|------------|-------------------------------------| ++=================================+============+=========+=================+==============+===============+==========+==============+===========+===========+==============+===============+===========+===========================+======================+====================================+================================+=================================+==============================+================================+============+=====================================+ | RN101 openai | 0.561 | 0.651 | 0.780 | 0.807 | 0.476 | 0.432 | 0.652 | 0.869 | 0.226 | 0.314 | 0.547 | 0.583 | 0.280 | 0.242 | 0.130 | 0.031 | 0.021 | 0.054 | 0.111 | 0.139 | 0.263 | ++---------------------------------+------------+---------+-----------------+--------------+---------------+----------+--------------+-----------+-----------+--------------+---------------+-----------+---------------------------+----------------------+------------------------------------+--------------------------------+---------------------------------+------------------------------+--------------------------------+------------+-------------------------------------+ | RN101 yfcc15m | 0.221 | 0.243 | 0.469 | 0.299 | 0.125 | 0.117 | 0.210 | 0.177 | 0.137 | 0.151 | 0.099 | 0.479 | 0.584 | 0.109 | 0.159 | 0.031 | 0.019 | 0.055 | 0.097 | 0.153 | 0.252 | 
++---------------------------------+------------+---------+-----------------+--------------+---------------+----------+--------------+-----------+-----------+--------------+---------------+-----------+---------------------------+----------------------+------------------------------------+--------------------------------+---------------------------------+------------------------------+--------------------------------+------------+-------------------------------------+ | RN50 cc12m | 0.224 | 0.438 | 0.582 | 0.395 | 0.178 | 0.135 | 0.095 | 0.331 | 0.102 | 0.148 | 0.117 | 0.535 | 0.293 | 0.184 | 0.222 | 0.031 | 0.025 | 0.047 | 0.096 | 0.161 | 0.155 | ++---------------------------------+------------+---------+-----------------+--------------+---------------+----------+--------------+-----------+-----------+--------------+---------------+-----------+---------------------------+----------------------+------------------------------------+--------------------------------+---------------------------------+------------------------------+--------------------------------+------------+-------------------------------------+ | RN50 openai | 0.529 | 0.650 | 0.772 | 0.715 | 0.403 | 0.415 | 0.660 | 0.857 | 0.303 | 0.408 | 0.453 | 0.636 | 0.171 | 0.217 | 0.148 | 0.034 | 0.014 | 0.056 | 0.110 | 0.145 | 0.170 | ++---------------------------------+------------+---------+-----------------+--------------+---------------+----------+--------------+-----------+-----------+--------------+---------------+-----------+---------------------------+----------------------+------------------------------------+--------------------------------+---------------------------------+------------------------------+--------------------------------+------------+-------------------------------------+ | RN50 yfcc15m | 0.214 | 0.215 | 0.402 | 0.291 | 0.116 | 0.122 | 0.167 | 0.174 | 0.157 | 0.172 | 0.123 | 0.533 | 0.358 | 0.151 | 0.158 | 0.032 | 0.024 | 0.053 | 0.120 | 0.160 | 0.336 | 
++---------------------------------+------------+---------+-----------------+--------------+---------------+----------+--------------+-----------+-----------+--------------+---------------+-----------+---------------------------+----------------------+------------------------------------+--------------------------------+---------------------------------+------------------------------+--------------------------------+------------+-------------------------------------+ | RN50x16 openai | 0.643 | 0.680 | 0.810 | 0.813 | 0.522 | 0.524 | 0.724 | 0.898 | 0.409 | 0.433 | 0.589 | 0.625 | 0.715 | 0.195 | 0.213 | 0.030 | 0.026 | 0.050 | 0.116 | 0.146 | 0.229 | ++---------------------------------+------------+---------+-----------------+--------------+---------------+----------+--------------+-----------+-----------+--------------+---------------+-----------+---------------------------+----------------------+------------------------------------+--------------------------------+---------------------------------+------------------------------+--------------------------------+------------+-------------------------------------+ | RN50x4 openai | 0.594 | 0.682 | 0.781 | 0.794 | 0.451 | 0.486 | 0.698 | 0.887 | 0.367 | 0.335 | 0.532 | 0.569 | 0.318 | 0.205 | 0.082 | 0.031 | 0.026 | 0.056 | 0.108 | 0.162 | 0.233 | ++---------------------------------+------------+---------+-----------------+--------------+---------------+----------+--------------+-----------+-----------+--------------+---------------+-----------+---------------------------+----------------------+------------------------------------+--------------------------------+---------------------------------+------------------------------+--------------------------------+------------+-------------------------------------+ | RN50x64 openai | 0.670 | 0.740 | 0.834 | 0.851 | 0.598 | 0.531 | 0.788 | 0.936 | 0.481 | 0.577 | 0.628 | 0.539 | 0.073 | 0.227 | 0.200 | 0.034 | 0.025 | 0.056 | 0.125 | 0.158 | 0.311 | 
++---------------------------------+------------+---------+-----------------+--------------+---------------+----------+--------------+-----------+-----------+--------------+---------------+-----------+---------------------------+----------------------+------------------------------------+--------------------------------+---------------------------------+------------------------------+--------------------------------+------------+-------------------------------------+ | ViT-B-16 laion400m_e31 | 0.594 | 0.767 | 0.838 | 0.917 | 0.712 | 0.513 | 0.694 | 0.892 | 0.380 | 0.503 | 0.585 | 0.593 | 0.062 | 0.289 | 0.245 | 0.031 | 0.030 | 0.059 | 0.100 | 0.152 | 0.200 | ++---------------------------------+------------+---------+-----------------+--------------+---------------+----------+--------------+-----------+-----------+--------------+---------------+-----------+---------------------------+----------------------+------------------------------------+--------------------------------+---------------------------------+------------------------------+--------------------------------+------------+-------------------------------------+ | ViT-B-16 laion400m_e32 | 0.597 | 0.768 | 0.837 | 0.917 | 0.712 | 0.513 | 0.692 | 0.892 | 0.385 | 0.501 | 0.585 | 0.598 | 0.077 | 0.287 | 0.245 | 0.032 | 0.029 | 0.060 | 0.099 | 0.151 | 0.183 | ++---------------------------------+------------+---------+-----------------+--------------+---------------+----------+--------------+-----------+-----------+--------------+---------------+-----------+---------------------------+----------------------+------------------------------------+--------------------------------+---------------------------------+------------------------------+--------------------------------+------------+-------------------------------------+ | ViT-B-16 openai | 0.619 | 0.783 | 0.819 | 0.908 | 0.669 | 0.449 | 0.712 | 0.890 | 0.313 | 0.559 | 0.582 | 0.507 | 0.036 | 0.209 | 0.158 | 0.030 | 0.023 | 0.053 | 0.122 | 0.155 | 0.263 | 
++---------------------------------+------------+---------+-----------------+--------------+---------------+----------+--------------+-----------+-----------+--------------+---------------+-----------+---------------------------+----------------------+------------------------------------+--------------------------------+---------------------------------+------------------------------+--------------------------------+------------+-------------------------------------+ | ViT-B-16-plus-240 laion400m_e31 | 0.614 | 0.764 | 0.832 | 0.925 | 0.733 | 0.555 | 0.706 | 0.904 | 0.355 | 0.569 | 0.615 | 0.551 | 0.093 | 0.240 | 0.159 | 0.041 | 0.026 | 0.056 | 0.111 | 0.149 | 0.280 | ++---------------------------------+------------+---------+-----------------+--------------+---------------+----------+--------------+-----------+-----------+--------------+---------------+-----------+---------------------------+----------------------+------------------------------------+--------------------------------+---------------------------------+------------------------------+--------------------------------+------------+-------------------------------------+ | ViT-B-16-plus-240 laion400m_e32 | 0.615 | 0.764 | 0.833 | 0.928 | 0.738 | 0.555 | 0.711 | 0.902 | 0.362 | 0.581 | 0.613 | 0.551 | 0.095 | 0.238 | 0.160 | 0.043 | 0.027 | 0.054 | 0.110 | 0.148 | 0.281 | ++---------------------------------+------------+---------+-----------------+--------------+---------------+----------+--------------+-----------+-----------+--------------+---------------+-----------+---------------------------+----------------------+------------------------------------+--------------------------------+---------------------------------+------------------------------+--------------------------------+------------+-------------------------------------+ | ViT-B-32 laion2b_e16 | 0.573 | 0.788 | 0.831 | 0.941 | 0.754 | 0.539 | 0.691 | 0.893 | 0.388 | 0.503 | 0.619 | 0.506 | 0.195 | 0.192 | 0.167 | 0.031 | 0.024 | 0.052 | 0.110 
| 0.189 | 0.176 | ++---------------------------------+------------+---------+-----------------+--------------+---------------+----------+--------------+-----------+-----------+--------------+---------------+-----------+---------------------------+----------------------+------------------------------------+--------------------------------+---------------------------------+------------------------------+--------------------------------+------------+-------------------------------------+ | ViT-B-32 laion2b_s34b_b79k | 0.581 | 0.791 | 0.839 | 0.936 | 0.755 | 0.557 | 0.716 | 0.909 | 0.410 | 0.482 | 0.610 | 0.598 | 0.734 | 0.153 | 0.189 | 0.029 | 0.034 | 0.062 | 0.113 | 0.159 | 0.262 | ++---------------------------------+------------+---------+-----------------+--------------+---------------+----------+--------------+-----------+-----------+--------------+---------------+-----------+---------------------------+----------------------+------------------------------------+--------------------------------+---------------------------------+------------------------------+--------------------------------+------------+-------------------------------------+ | ViT-B-32 laion400m_e31 | 0.523 | 0.731 | 0.818 | 0.883 | 0.678 | 0.521 | 0.659 | 0.856 | 0.220 | 0.470 | 0.510 | 0.549 | 0.259 | 0.155 | 0.161 | 0.033 | 0.021 | 0.053 | 0.117 | 0.173 | 0.122 | ++---------------------------------+------------+---------+-----------------+--------------+---------------+----------+--------------+-----------+-----------+--------------+---------------+-----------+---------------------------+----------------------+------------------------------------+--------------------------------+---------------------------------+------------------------------+--------------------------------+------------+-------------------------------------+ | ViT-B-32 laion400m_e32 | 0.523 | 0.733 | 0.817 | 0.885 | 0.677 | 0.523 | 0.658 | 0.854 | 0.223 | 0.476 | 0.510 | 0.548 | 0.240 | 0.153 | 0.161 | 0.033 | 0.021 | 0.054 | 
0.117 | 0.173 | 0.118 | ++---------------------------------+------------+---------+-----------------+--------------+---------------+----------+--------------+-----------+-----------+--------------+---------------+-----------+---------------------------+----------------------+------------------------------------+--------------------------------+---------------------------------+------------------------------+--------------------------------+------------+-------------------------------------+ | ViT-B-32 openai | 0.559 | 0.764 | 0.815 | 0.898 | 0.643 | 0.443 | 0.664 | 0.873 | 0.135 | 0.504 | 0.537 | 0.623 | 0.447 | 0.232 | 0.164 | 0.037 | 0.024 | 0.061 | 0.127 | 0.193 | 0.274 | ++---------------------------------+------------+---------+-----------------+--------------+---------------+----------+--------------+-----------+-----------+--------------+---------------+-----------+---------------------------+----------------------+------------------------------------+--------------------------------+---------------------------------+------------------------------+--------------------------------+------------+-------------------------------------+ | ViT-g-14 laion2b_s12b_b42k | 0.696 | 0.811 | 0.851 | 0.971 | 0.839 | 0.682 | 0.776 | 0.943 | 0.603 | 0.648 | 0.718 | 0.560 | 0.580 | 0.332 | 0.175 | 0.036 | 0.031 | 0.060 | 0.115 | 0.190 | 0.138 | ++---------------------------------+------------+---------+-----------------+--------------+---------------+----------+--------------+-----------+-----------+--------------+---------------+-----------+---------------------------+----------------------+------------------------------------+--------------------------------+---------------------------------+------------------------------+--------------------------------+------------+-------------------------------------+ | ViT-H-14 laion2b_s32b_b79k | 0.709 | 0.777 | 0.850 | 0.975 | 0.847 | 0.678 | 0.801 | 0.945 | 0.563 | 0.726 | 0.699 | 0.542 | 0.297 | 0.268 | 0.169 | 0.032 | 0.027 | 0.054 
| 0.111 | 0.140 | 0.110 | ++---------------------------------+------------+---------+-----------------+--------------+---------------+----------+--------------+-----------+-----------+--------------+---------------+-----------+---------------------------+----------------------+------------------------------------+--------------------------------+---------------------------------+------------------------------+--------------------------------+------------+-------------------------------------+ | ViT-L-14 laion2b_s32b_b82k | 0.677 | 0.805 | 0.851 | 0.966 | 0.833 | 0.629 | 0.758 | 0.932 | 0.459 | 0.646 | 0.668 | 0.563 | 0.116 | 0.312 | 0.161 | 0.032 | 0.020 | 0.056 | 0.108 | 0.224 | 0.229 | ++---------------------------------+------------+---------+-----------------+--------------+---------------+----------+--------------+-----------+-----------+--------------+---------------+-----------+---------------------------+----------------------+------------------------------------+--------------------------------+---------------------------------+------------------------------+--------------------------------+------------+-------------------------------------+ | ViT-L-14 laion400m_e31 | 0.654 | 0.758 | 0.839 | 0.947 | 0.774 | 0.598 | 0.757 | 0.917 | 0.378 | 0.632 | 0.671 | 0.487 | 0.058 | 0.242 | 0.149 | 0.030 | 0.026 | 0.053 | 0.109 | 0.186 | 0.200 | ++---------------------------------+------------+---------+-----------------+--------------+---------------+----------+--------------+-----------+-----------+--------------+---------------+-----------+---------------------------+----------------------+------------------------------------+--------------------------------+---------------------------------+------------------------------+--------------------------------+------------+-------------------------------------+ | ViT-L-14 laion400m_e32 | 0.654 | 0.756 | 0.839 | 0.946 | 0.774 | 0.605 | 0.756 | 0.919 | 0.380 | 0.622 | 0.675 | 0.493 | 0.061 | 0.243 | 0.149 | 0.030 | 0.026 | 
0.053 | 0.110 | 0.186 | 0.203 | ++---------------------------------+------------+---------+-----------------+--------------+---------------+----------+--------------+-----------+-----------+--------------+---------------+-----------+---------------------------+----------------------+------------------------------------+--------------------------------+---------------------------------+------------------------------+--------------------------------+------------+-------------------------------------+ | ViT-L-14 openai | 0.698 | 0.783 | 0.835 | 0.956 | 0.758 | 0.554 | 0.792 | 0.932 | 0.571 | 0.626 | 0.633 | 0.520 | 0.733 | 0.194 | 0.161 | 0.032 | 0.023 | 0.045 | 0.115 | 0.163 | 0.218 | -| ViT-L-14-336 openai | 0.709 | 0.781 | 0.837 | 0.949 | 0.744 | 0.556 | 0.783 | 0.937 | 0.560 | 0.615 | 0.638 | 0.608 | 0.733 | 0.200 | 0.158 | 0.032 | 0.024 | 0.046 | 0.113 | 0.158 | 0.262 | \ No newline at end of file ++---------------------------------+------------+---------+-----------------+--------------+---------------+----------+--------------+-----------+-----------+--------------+---------------+-----------+---------------------------+----------------------+------------------------------------+--------------------------------+---------------------------------+------------------------------+--------------------------------+------------+-------------------------------------+ +| ViT-L-14-336 openai | 0.709 | 0.781 | 0.837 | 0.949 | 0.744 | 0.556 | 0.783 | 0.937 | 0.560 | 0.615 | 0.638 | 0.608 | 0.733 | 0.200 | 0.158 | 0.032 | 0.024 | 0.046 | 0.113 | 0.158 | 0.262 | 
++---------------------------------+------------+---------+-----------------+--------------+---------------+----------+--------------+-----------+-----------+--------------+---------------+-----------+---------------------------+----------------------+------------------------------------+--------------------------------+---------------------------------+------------------------------+--------------------------------+------------+-------------------------------------+ From 444b57d8ba0e19c833ed53d9a8cb7c8005090fd0 Mon Sep 17 00:00:00 2001 From: ZiniuYu Date: Fri, 30 Sep 2022 19:09:07 +0800 Subject: [PATCH 16/34] fix: rst --- docs/user-guides/benchmark.rst | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/docs/user-guides/benchmark.rst b/docs/user-guides/benchmark.rst index eab9ed5a6..c19214edc 100644 --- a/docs/user-guides/benchmark.rst +++ b/docs/user-guides/benchmark.rst @@ -13,36 +13,65 @@ We include the disk usage (in delta) and the peak RAM and VRAM usage (in delta) | Model | Disk Usage (MB) | Peak RAM Usage (GB) | Peak VRAM Usage (GB) | +========================================+==================+======================+=======================+ | RN50::openai | 244 | 2.99 | 1.36 | ++----------------------------------------+------------------+----------------------+-----------------------+ | RN50::yfcc15m | 389 | 2.86 | 1.36 | ++----------------------------------------+------------------+----------------------+-----------------------+ | RN50::cc12m | 389 | 2.84 | 1.36 | ++----------------------------------------+------------------+----------------------+-----------------------+ | RN101::openai | 278 | 3.05 | 1.40 | ++----------------------------------------+------------------+----------------------+-----------------------+ | RN101::yfcc15m | 457 | 2.88 | 1.40 | ++----------------------------------------+------------------+----------------------+-----------------------+ | RN50x4::openai | 402 | 3.23 | 1.63 | 
++----------------------------------------+------------------+----------------------+-----------------------+ | RN50x16::openai | 631 | 3.63 | 2.02 | ++----------------------------------------+------------------+----------------------+-----------------------+ | RN50x64::openai | 1291 | 4.08 | 2.98 | ++----------------------------------------+------------------+----------------------+-----------------------+ | ViT-B-32::openai | 338 | 3.20 | 1.40 | ++----------------------------------------+------------------+----------------------+-----------------------+ | ViT-B-32::laion2b_e16 | 577 | 2.93 | 1.40 | ++----------------------------------------+------------------+----------------------+-----------------------+ | ViT-B-32::laion400m_e31 | 577 | 2.93 | 1.40 | ++----------------------------------------+------------------+----------------------+-----------------------+ | ViT-B-32::laion400m_e32 | 577 | 2.94 | 1.40 | ++----------------------------------------+------------------+----------------------+-----------------------+ | ViT-B-32::laion2b-s34b-b79k | 577 | 2.94 | 1.40 | ++----------------------------------------+------------------+----------------------+-----------------------+ | ViT-B-16::openai | 335 | 3.20 | 1.44 | ++----------------------------------------+------------------+----------------------+-----------------------+ | ViT-B-16::laion400m_e31 | 571 | 2.93 | 1.44 | ++----------------------------------------+------------------+----------------------+-----------------------+ | ViT-B-16::laion400m_e32 | 571 | 2.94 | 1.44 | ++----------------------------------------+------------------+----------------------+-----------------------+ | ViT-B-16-plus-240::laion400m_e31 | 795 | 3.03 | 1.59 | ++----------------------------------------+------------------+----------------------+-----------------------+ | ViT-B-16-plus-240::laion400m_e32 | 795 | 3.03 | 1.59 | ++----------------------------------------+------------------+----------------------+-----------------------+ | 
ViT-L-14::openai | 890 | 3.66 | 2.04 | ++----------------------------------------+------------------+----------------------+-----------------------+ | ViT-L-14::laion400m_e31 | 1631 | 3.43 | 2.03 | ++----------------------------------------+------------------+----------------------+-----------------------+ | ViT-L-14::laion400m_e32 | 1631 | 3.42 | 2.03 | ++----------------------------------------+------------------+----------------------+-----------------------+ | ViT-L-14::laion2b-s32b-b82k | 1631 | 3.43 | 2.03 | ++----------------------------------------+------------------+----------------------+-----------------------+ | ViT-L-14-336::openai | 891 | 3.74 | 2.23 | ++----------------------------------------+------------------+----------------------+-----------------------+ | ViT-H-14::laion2b-s32b-b79k | 3762 | 4.45 | 3.26 | ++----------------------------------------+------------------+----------------------+-----------------------+ | ViT-g-14::laion2b-s12b-b42k | 5214 | 5.16 | 4.00 | ++----------------------------------------+------------------+----------------------+-----------------------+ | M-CLIP/LABSE-Vit-L-14 | 3609 | 4.30 | 4.70 | ++----------------------------------------+------------------+----------------------+-----------------------+ | M-CLIP/XLM-Roberta-Large-Vit-B-32 | 4284 | 5.37 | 1.68 | ++----------------------------------------+------------------+----------------------+-----------------------+ | M-CLIP/XLM-Roberta-Large-Vit-B-16Plus | 4293 | 4.30 | 4.13 | ++----------------------------------------+------------------+----------------------+-----------------------+ | M-CLIP/XLM-Roberta-Large-Vit-L-14 | 4293 | 4.30 | 4.97 | +----------------------------------------+------------------+----------------------+-----------------------+ + Zero-shot retrieval ^^^^^^^^^^^^^^^^^^^ From 72a1cd1f77a087d3ec830fa61160483b4252c1a3 Mon Sep 17 00:00:00 2001 From: ZiniuYu Date: Fri, 30 Sep 2022 21:32:09 +0800 Subject: [PATCH 17/34] fix: subtitle --- 
docs/user-guides/benchmark.rst | 33 ++++++++++++++++++++++----------- 1 file changed, 22 insertions(+), 11 deletions(-) diff --git a/docs/user-guides/benchmark.rst b/docs/user-guides/benchmark.rst index c19214edc..c352fcc34 100644 --- a/docs/user-guides/benchmark.rst +++ b/docs/user-guides/benchmark.rst @@ -1,22 +1,24 @@ CLIP Benchmark ============== -In order to evaluate the performance of different CLIP models, we conducted a benchmark on a series of tasks using different datasets. You can find the benchmark results in the following tables. The best results are highlighted in bold. They can be used as a guide to choose the best model for your application. - +In order to evaluate the performance of different CLIP models, we conducted a benchmark on a series of tasks using different datasets. +You can find the benchmark results in the following tables. +The best results are highlighted in bold. +They can be used as a guide to choose the best model for your application. Basic statistics -^^^^^^^^^^^^^^^^ +---------------- -We include the disk usage (in delta) and the peak RAM and VRAM usage (in delta) when running on a single Nvidia TITAN RTX GPU (24GB VRAM) for a series of text and image encoding tasks with `batch_size=8` using PyTorch runtime. +We include the disk usage (in delta) and the peak RAM and VRAM usage (in delta) when running on a single Nvidia TITAN RTX GPU (24GB VRAM) for a series of text and image encoding tasks with :code:`batch_size=8` using PyTorch runtime. 
+----------------------------------------+------------------+----------------------+-----------------------+ | Model | Disk Usage (MB) | Peak RAM Usage (GB) | Peak VRAM Usage (GB) | +========================================+==================+======================+=======================+ -| RN50::openai | 244 | 2.99 | 1.36 | +| RN50::openai | **244** | 2.99 | **1.36** | +----------------------------------------+------------------+----------------------+-----------------------+ -| RN50::yfcc15m | 389 | 2.86 | 1.36 | +| RN50::yfcc15m | 389 | 2.86 | **1.36** | +----------------------------------------+------------------+----------------------+-----------------------+ -| RN50::cc12m | 389 | 2.84 | 1.36 | +| RN50::cc12m | 389 | **2.84** | **1.36** | +----------------------------------------+------------------+----------------------+-----------------------+ | RN101::openai | 278 | 3.05 | 1.40 | +----------------------------------------+------------------+----------------------+-----------------------+ @@ -71,9 +73,15 @@ We include the disk usage (in delta) and the peak RAM and VRAM usage (in delta) | M-CLIP/XLM-Roberta-Large-Vit-L-14 | 4293 | 4.30 | 4.97 | +----------------------------------------+------------------+----------------------+-----------------------+ - Zero-shot retrieval -^^^^^^^^^^^^^^^^^^^ +------------------- + +In zero-shot retrieval benchmark, each model is evaluated on the following datasets: COCO Caption, Flickr 8k and Flickr 30k. +For the above datasets, there are five corresponding description sentences for each image written by humans. +The results are reported in terms of top-5 text-to-image retrieval recall, top-5 image-to-text retrieval recall and their averages. +More specifically, the top-5 text-to-image retrieval recall for each retrieved image is either 1 or 0. +It is 1 if the input text matches one of the image descriptions among the top-5. 
+The top-5 image-to-text retrieval recall for each image is the number of top-5 retrieved texts matching that image descriptions. +----------------------------------+-------------------------+-------------------------+-------------------------+ | Model | COCO Caption | Flickr 8k | Flickr 30k | @@ -131,9 +139,12 @@ Zero-shot retrieval | ViT-L-14-336::openai | 0.616 | 0.812 | 0.714 | 0.629 | 0.779 | 0.704 | 0.533 | 0.741 | 0.637 | +----------------------------------+-------+-------+---------+-------+-------+---------+-------+-------+---------+ - Zero-shot classification -^^^^^^^^^^^^^^^^^^^^^^^^ +------------------------ + +In zero-shot classification benchmark, each model is evaluated on the following datasets: ImageNet V2, VOC 2007 and 19 VTAB datasets. +The results are shown in the following table. +For each dataset, we report the top-1 accuracy. +---------------------------------+------------+---------+-----------------+--------------+---------------+----------+--------------+-----------+-----------+--------------+---------------+-----------+---------------------------+----------------------+------------------------------------+--------------------------------+---------------------------------+------------------------------+--------------------------------+------------+-------------------------------------+ | model_fullname | imagenetv2 | voc2007 | vtab/caltech101 | vtab/cifar10 | vtab/cifar100 | vtab/dtd | vtab/flowers | vtab/pets | vtab/svhn | vtab/eurosat | vtab/resisc45 | vtab/pcam | vtab/diabetic_retinopathy | vtab/clevr_count_all | vtab/clevr_closest_object_distance | vtab/dsprites_label_x_position | vtab/dsprites_label_orientation | vtab/smallnorb_label_azimuth | vtab/smallnorb_label_elevation | vtab/dmlab | vtab/kitti_closest_vehicle_distance | From 63d90cdbac5939ff51ff0850219e749fb0208b50 Mon Sep 17 00:00:00 2001 From: ZiniuYu Date: Fri, 30 Sep 2022 22:05:16 +0800 Subject: [PATCH 18/34] docs: classification benchmark --- docs/user-guides/benchmark.rst | 
222 +++++++++++++++++---------------- 1 file changed, 112 insertions(+), 110 deletions(-) diff --git a/docs/user-guides/benchmark.rst b/docs/user-guides/benchmark.rst index c352fcc34..57613360a 100644 --- a/docs/user-guides/benchmark.rst +++ b/docs/user-guides/benchmark.rst @@ -76,126 +76,128 @@ We include the disk usage (in delta) and the peak RAM and VRAM usage (in delta) Zero-shot retrieval ------------------- -In zero-shot retrieval benchmark, each model is evaluated on the following datasets: COCO Caption, Flickr 8k and Flickr 30k. +In zero-shot retrieval benchmark, each model is evaluated on the following datasets: COCO Caption, Flickr8k and Flickr30k. For the above datasets, there are five corresponding description sentences for each image written by humans. The results are reported in terms of top-5 text-to-image retrieval recall, top-5 image-to-text retrieval recall and their averages. More specifically, the top-5 text-to-image retrieval recall for each retrieved image is either 1 or 0. It is 1 if the input text matches one of the image descriptions among the top-5. The top-5 image-to-text retrieval recall for each image is the number of top-5 retrieved texts matching that image descriptions. 
-+----------------------------------+-------------------------+-------------------------+-------------------------+ -| Model | COCO Caption | Flickr 8k | Flickr 30k | -| +-------+-------+---------+-------+-------+---------+-------+-------+---------+ -| | Image | Text | Average | Image | Text | Average | Image | Text | Average | -+==================================+=======+=======+=========+=======+=======+=========+=======+=======+=========+ -| RN101::openai | 0.555 | 0.745 | 0.650 | 0.523 | 0.694 | 0.608 | 0.415 | 0.629 | 0.522 | -+----------------------------------+-------+-------+---------+-------+-------+---------+-------+-------+---------+ -| RN101::yfcc15m | 0.376 | 0.549 | 0.463 | 0.251 | 0.417 | 0.334 | 0.156 | 0.296 | 0.226 | -+----------------------------------+-------+-------+---------+-------+-------+---------+-------+-------+---------+ -| RN50::cc12m | 0.446 | 0.607 | 0.527 | 0.302 | 0.435 | 0.369 | 0.204 | 0.316 | 0.260 | -+----------------------------------+-------+-------+---------+-------+-------+---------+-------+-------+---------+ -| RN50::openai | 0.529 | 0.728 | 0.629 | 0.504 | 0.690 | 0.597 | 0.392 | 0.621 | 0.506 | -+----------------------------------+-------+-------+---------+-------+-------+---------+-------+-------+---------+ -| RN50::yfcc15m | 0.361 | 0.534 | 0.447 | 0.238 | 0.394 | 0.316 | 0.146 | 0.278 | 0.212 | -+----------------------------------+-------+-------+---------+-------+-------+---------+-------+-------+---------+ -| RN50x16::openai | 0.600 | 0.787 | 0.693 | 0.597 | 0.768 | 0.682 | 0.496 | 0.713 | 0.604 | -+----------------------------------+-------+-------+---------+-------+-------+---------+-------+-------+---------+ -| RN50x4::openai | 0.581 | 0.767 | 0.674 | 0.558 | 0.729 | 0.643 | 0.451 | 0.671 | 0.561 | -+----------------------------------+-------+-------+---------+-------+-------+---------+-------+-------+---------+ -| RN50x64::openai | 0.599 | 0.803 | 0.701 | 0.629 | 0.790 | 0.709 | 0.534 | 0.756 | 0.645 | 
-+----------------------------------+-------+-------+---------+-------+-------+---------+-------+-------+---------+ -| ViT-B-16::laion400m_e31 | 0.637 | 0.796 | 0.717 | 0.620 | 0.765 | 0.692 | 0.506 | 0.697 | 0.602 | -+----------------------------------+-------+-------+---------+-------+-------+---------+-------+-------+---------+ -| ViT-B-16::laion400m_e32 | 0.636 | 0.796 | 0.716 | 0.620 | 0.767 | 0.694 | 0.508 | 0.697 | 0.603 | -+----------------------------------+-------+-------+---------+-------+-------+---------+-------+-------+---------+ -| ViT-B-16::openai | 0.584 | 0.767 | 0.676 | 0.564 | 0.727 | 0.646 | 0.452 | 0.671 | 0.561 | -+----------------------------------+-------+-------+---------+-------+-------+---------+-------+-------+---------+ -| ViT-B-16-plus-240::laion400m_e31 | 0.660 | 0.809 | 0.735 | 0.642 | 0.788 | 0.715 | 0.533 | 0.725 | 0.629 | -+----------------------------------+-------+-------+---------+-------+-------+---------+-------+-------+---------+ -| ViT-B-16-plus-240::laion400m_e32 | 0.662 | 0.811 | 0.736 | 0.644 | 0.791 | 0.718 | 0.535 | 0.727 | 0.631 | -+----------------------------------+-------+-------+---------+-------+-------+---------+-------+-------+---------+ -| ViT-B-32::laion2b_e16 | 0.647 | 0.795 | 0.721 | 0.622 | 0.760 | 0.691 | 0.507 | 0.687 | 0.597 | -+----------------------------------+-------+-------+---------+-------+-------+---------+-------+-------+---------+ -| ViT-B-32::laion2b_s34b_b79k | 0.654 | 0.798 | 0.726 | 0.629 | 0.778 | 0.703 | 0.513 | 0.694 | 0.603 | -+----------------------------------+-------+-------+---------+-------+-------+---------+-------+-------+---------+ -| ViT-B-32::laion400m_e31 | 0.600 | 0.763 | 0.682 | 0.562 | 0.736 | 0.649 | 0.438 | 0.633 | 0.536 | -+----------------------------------+-------+-------+---------+-------+-------+---------+-------+-------+---------+ -| ViT-B-32::laion400m_e32 | 0.600 | 0.765 | 0.682 | 0.562 | 0.736 | 0.649 | 0.437 | 0.634 | 0.536 | 
-+----------------------------------+-------+-------+---------+-------+-------+---------+-------+-------+---------+ -| ViT-B-32::openai | 0.560 | 0.749 | 0.654 | 0.532 | 0.699 | 0.616 | 0.413 | 0.629 | 0.521 | -+----------------------------------+-------+-------+---------+-------+-------+---------+-------+-------+---------+ -| ViT-g-14::laion2b_s12b_b42k | 0.724 | 0.853 | 0.788 | 0.730 | 0.846 | 0.788 | 0.639 | 0.806 | 0.722 | -+----------------------------------+-------+-------+---------+-------+-------+---------+-------+-------+---------+ -| ViT-H-14::laion2b_s32b_b79k | 0.734 | 0.861 | 0.797 | 0.746 | 0.856 | 0.801 | 0.657 | 0.823 | 0.740 | -+----------------------------------+-------+-------+---------+-------+-------+---------+-------+-------+---------+ -| ViT-L-14::laion2b_s32b_b82k | 0.711 | 0.840 | 0.775 | 0.712 | 0.824 | 0.768 | 0.620 | 0.789 | 0.704 | -+----------------------------------+-------+-------+---------+-------+-------+---------+-------+-------+---------+ -| ViT-L-14::laion400m_e31 | 0.680 | 0.821 | 0.750 | 0.675 | 0.806 | 0.741 | 0.570 | 0.751 | 0.661 | -+----------------------------------+-------+-------+---------+-------+-------+---------+-------+-------+---------+ -| ViT-L-14::laion400m_e32 | 0.680 | 0.821 | 0.751 | 0.675 | 0.806 | 0.740 | 0.570 | 0.751 | 0.661 | -+----------------------------------+-------+-------+---------+-------+-------+---------+-------+-------+---------+ -| ViT-L-14::openai | 0.610 | 0.793 | 0.702 | 0.599 | 0.767 | 0.683 | 0.494 | 0.717 | 0.605 | -+----------------------------------+-------+-------+---------+-------+-------+---------+-------+-------+---------+ -| ViT-L-14-336::openai | 0.616 | 0.812 | 0.714 | 0.629 | 0.779 | 0.704 | 0.533 | 0.741 | 0.637 | -+----------------------------------+-------+-------+---------+-------+-------+---------+-------+-------+---------+ ++----------------------------------+-------------------------------------+-------------------------------------+-------------------------------------+ 
+| Model | COCO Caption | Flickr8k | Flickr30k | +| +-------------+-------------+---------+-------------+-------------+---------+-------------+-------------+---------+ +| | Text->image | Image->text | Average | Text->image | Image->text | Average | Text->image | Image->text | Average | ++==================================+=============+=============+=========+=============+=============+=========+=============+=============+=========+ +| RN101::openai | 0.555 | 0.745 | 0.650 | 0.523 | 0.694 | 0.608 | 0.415 | 0.629 | 0.522 | ++----------------------------------+-------------+-------------+---------+-------------+-------------+---------+-------------+-------------+---------+ +| RN101::yfcc15m | 0.376 | 0.549 | 0.463 | 0.251 | 0.417 | 0.334 | 0.156 | 0.296 | 0.226 | ++----------------------------------+-------------+-------------+---------+-------------+-------------+---------+-------------+-------------+---------+ +| RN50::cc12m | 0.446 | 0.607 | 0.527 | 0.302 | 0.435 | 0.369 | 0.204 | 0.316 | 0.260 | ++----------------------------------+-------------+-------------+---------+-------------+-------------+---------+-------------+-------------+---------+ +| RN50::openai | 0.529 | 0.728 | 0.629 | 0.504 | 0.690 | 0.597 | 0.392 | 0.621 | 0.506 | ++----------------------------------+-------------+-------------+---------+-------------+-------------+---------+-------------+-------------+---------+ +| RN50::yfcc15m | 0.361 | 0.534 | 0.447 | 0.238 | 0.394 | 0.316 | 0.146 | 0.278 | 0.212 | ++----------------------------------+-------------+-------------+---------+-------------+-------------+---------+-------------+-------------+---------+ +| RN50x16::openai | 0.600 | 0.787 | 0.693 | 0.597 | 0.768 | 0.682 | 0.496 | 0.713 | 0.604 | ++----------------------------------+-------------+-------------+---------+-------------+-------------+---------+-------------+-------------+---------+ +| RN50x4::openai | 0.581 | 0.767 | 0.674 | 0.558 | 0.729 | 0.643 | 0.451 | 0.671 | 0.561 | 
++----------------------------------+-------------+-------------+---------+-------------+-------------+---------+-------------+-------------+---------+ +| RN50x64::openai | 0.599 | 0.803 | 0.701 | 0.629 | 0.790 | 0.709 | 0.534 | 0.756 | 0.645 | ++----------------------------------+-------------+-------------+---------+-------------+-------------+---------+-------------+-------------+---------+ +| ViT-B-16::laion400m_e31 | 0.637 | 0.796 | 0.717 | 0.620 | 0.765 | 0.692 | 0.506 | 0.697 | 0.602 | ++----------------------------------+-------------+-------------+---------+-------------+-------------+---------+-------------+-------------+---------+ +| ViT-B-16::laion400m_e32 | 0.636 | 0.796 | 0.716 | 0.620 | 0.767 | 0.694 | 0.508 | 0.697 | 0.603 | ++----------------------------------+-------------+-------------+---------+-------------+-------------+---------+-------------+-------------+---------+ +| ViT-B-16::openai | 0.584 | 0.767 | 0.676 | 0.564 | 0.727 | 0.646 | 0.452 | 0.671 | 0.561 | ++----------------------------------+-------------+-------------+---------+-------------+-------------+---------+-------------+-------------+---------+ +| ViT-B-16-plus-240::laion400m_e31 | 0.660 | 0.809 | 0.735 | 0.642 | 0.788 | 0.715 | 0.533 | 0.725 | 0.629 | ++----------------------------------+-------------+-------------+---------+-------------+-------------+---------+-------------+-------------+---------+ +| ViT-B-16-plus-240::laion400m_e32 | 0.662 | 0.811 | 0.736 | 0.644 | 0.791 | 0.718 | 0.535 | 0.727 | 0.631 | ++----------------------------------+-------------+-------------+---------+-------------+-------------+---------+-------------+-------------+---------+ +| ViT-B-32::laion2b_e16 | 0.647 | 0.795 | 0.721 | 0.622 | 0.760 | 0.691 | 0.507 | 0.687 | 0.597 | ++----------------------------------+-------------+-------------+---------+-------------+-------------+---------+-------------+-------------+---------+ +| ViT-B-32::laion2b_s34b_b79k | 0.654 | 0.798 | 0.726 | 0.629 | 0.778 | 
0.703 | 0.513 | 0.694 | 0.603 | ++----------------------------------+-------------+-------------+---------+-------------+-------------+---------+-------------+-------------+---------+ +| ViT-B-32::laion400m_e31 | 0.600 | 0.763 | 0.682 | 0.562 | 0.736 | 0.649 | 0.438 | 0.633 | 0.536 | ++----------------------------------+-------------+-------------+---------+-------------+-------------+---------+-------------+-------------+---------+ +| ViT-B-32::laion400m_e32 | 0.600 | 0.765 | 0.682 | 0.562 | 0.736 | 0.649 | 0.437 | 0.634 | 0.536 | ++----------------------------------+-------------+-------------+---------+-------------+-------------+---------+-------------+-------------+---------+ +| ViT-B-32::openai | 0.560 | 0.749 | 0.654 | 0.532 | 0.699 | 0.616 | 0.413 | 0.629 | 0.521 | ++----------------------------------+-------------+-------------+---------+-------------+-------------+---------+-------------+-------------+---------+ +| ViT-g-14::laion2b_s12b_b42k | 0.724 | 0.853 | 0.788 | 0.730 | 0.846 | 0.788 | 0.639 | 0.806 | 0.722 | ++----------------------------------+-------------+-------------+---------+-------------+-------------+---------+-------------+-------------+---------+ +| ViT-H-14::laion2b_s32b_b79k | 0.734 | 0.861 | 0.797 | 0.746 | 0.856 | 0.801 | 0.657 | 0.823 | 0.740 | ++----------------------------------+-------------+-------------+---------+-------------+-------------+---------+-------------+-------------+---------+ +| ViT-L-14::laion2b_s32b_b82k | 0.711 | 0.840 | 0.775 | 0.712 | 0.824 | 0.768 | 0.620 | 0.789 | 0.704 | ++----------------------------------+-------------+-------------+---------+-------------+-------------+---------+-------------+-------------+---------+ +| ViT-L-14::laion400m_e31 | 0.680 | 0.821 | 0.750 | 0.675 | 0.806 | 0.741 | 0.570 | 0.751 | 0.661 | ++----------------------------------+-------------+-------------+---------+-------------+-------------+---------+-------------+-------------+---------+ +| ViT-L-14::laion400m_e32 | 0.680 | 
0.821 | 0.751 | 0.675 | 0.806 | 0.740 | 0.570 | 0.751 | 0.661 | ++----------------------------------+-------------+-------------+---------+-------------+-------------+---------+-------------+-------------+---------+ +| ViT-L-14::openai | 0.610 | 0.793 | 0.702 | 0.599 | 0.767 | 0.683 | 0.494 | 0.717 | 0.605 | ++----------------------------------+-------------+-------------+---------+-------------+-------------+---------+-------------+-------------+---------+ +| ViT-L-14-336::openai | 0.616 | 0.812 | 0.714 | 0.629 | 0.779 | 0.704 | 0.533 | 0.741 | 0.637 | ++----------------------------------+-------------+-------------+---------+-------------+-------------+---------+-------------+-------------+---------+ Zero-shot classification ------------------------ -In zero-shot classification benchmark, each model is evaluated on the following datasets: ImageNet V2, VOC 2007 and 19 VTAB datasets. +In zero-shot classification benchmark, each model is evaluated on the following datasets: ImageNetV2, VOC2007 and 19 VTAB datasets. The results are shown in the following table. For each dataset, we report the top-1 accuracy. 
-+---------------------------------+------------+---------+-----------------+--------------+---------------+----------+--------------+-----------+-----------+--------------+---------------+-----------+---------------------------+----------------------+------------------------------------+--------------------------------+---------------------------------+------------------------------+--------------------------------+------------+-------------------------------------+ -| model_fullname | imagenetv2 | voc2007 | vtab/caltech101 | vtab/cifar10 | vtab/cifar100 | vtab/dtd | vtab/flowers | vtab/pets | vtab/svhn | vtab/eurosat | vtab/resisc45 | vtab/pcam | vtab/diabetic_retinopathy | vtab/clevr_count_all | vtab/clevr_closest_object_distance | vtab/dsprites_label_x_position | vtab/dsprites_label_orientation | vtab/smallnorb_label_azimuth | vtab/smallnorb_label_elevation | vtab/dmlab | vtab/kitti_closest_vehicle_distance | -+=================================+============+=========+=================+==============+===============+==========+==============+===========+===========+==============+===============+===========+===========================+======================+====================================+================================+=================================+==============================+================================+============+=====================================+ -| RN101 openai | 0.561 | 0.651 | 0.780 | 0.807 | 0.476 | 0.432 | 0.652 | 0.869 | 0.226 | 0.314 | 0.547 | 0.583 | 0.280 | 0.242 | 0.130 | 0.031 | 0.021 | 0.054 | 0.111 | 0.139 | 0.263 | 
-+---------------------------------+------------+---------+-----------------+--------------+---------------+----------+--------------+-----------+-----------+--------------+---------------+-----------+---------------------------+----------------------+------------------------------------+--------------------------------+---------------------------------+------------------------------+--------------------------------+------------+-------------------------------------+ -| RN101 yfcc15m | 0.221 | 0.243 | 0.469 | 0.299 | 0.125 | 0.117 | 0.210 | 0.177 | 0.137 | 0.151 | 0.099 | 0.479 | 0.584 | 0.109 | 0.159 | 0.031 | 0.019 | 0.055 | 0.097 | 0.153 | 0.252 | -+---------------------------------+------------+---------+-----------------+--------------+---------------+----------+--------------+-----------+-----------+--------------+---------------+-----------+---------------------------+----------------------+------------------------------------+--------------------------------+---------------------------------+------------------------------+--------------------------------+------------+-------------------------------------+ -| RN50 cc12m | 0.224 | 0.438 | 0.582 | 0.395 | 0.178 | 0.135 | 0.095 | 0.331 | 0.102 | 0.148 | 0.117 | 0.535 | 0.293 | 0.184 | 0.222 | 0.031 | 0.025 | 0.047 | 0.096 | 0.161 | 0.155 | -+---------------------------------+------------+---------+-----------------+--------------+---------------+----------+--------------+-----------+-----------+--------------+---------------+-----------+---------------------------+----------------------+------------------------------------+--------------------------------+---------------------------------+------------------------------+--------------------------------+------------+-------------------------------------+ -| RN50 openai | 0.529 | 0.650 | 0.772 | 0.715 | 0.403 | 0.415 | 0.660 | 0.857 | 0.303 | 0.408 | 0.453 | 0.636 | 0.171 | 0.217 | 0.148 | 0.034 | 0.014 | 0.056 | 0.110 | 0.145 | 0.170 | 
-+---------------------------------+------------+---------+-----------------+--------------+---------------+----------+--------------+-----------+-----------+--------------+---------------+-----------+---------------------------+----------------------+------------------------------------+--------------------------------+---------------------------------+------------------------------+--------------------------------+------------+-------------------------------------+ -| RN50 yfcc15m | 0.214 | 0.215 | 0.402 | 0.291 | 0.116 | 0.122 | 0.167 | 0.174 | 0.157 | 0.172 | 0.123 | 0.533 | 0.358 | 0.151 | 0.158 | 0.032 | 0.024 | 0.053 | 0.120 | 0.160 | 0.336 | -+---------------------------------+------------+---------+-----------------+--------------+---------------+----------+--------------+-----------+-----------+--------------+---------------+-----------+---------------------------+----------------------+------------------------------------+--------------------------------+---------------------------------+------------------------------+--------------------------------+------------+-------------------------------------+ -| RN50x16 openai | 0.643 | 0.680 | 0.810 | 0.813 | 0.522 | 0.524 | 0.724 | 0.898 | 0.409 | 0.433 | 0.589 | 0.625 | 0.715 | 0.195 | 0.213 | 0.030 | 0.026 | 0.050 | 0.116 | 0.146 | 0.229 | -+---------------------------------+------------+---------+-----------------+--------------+---------------+----------+--------------+-----------+-----------+--------------+---------------+-----------+---------------------------+----------------------+------------------------------------+--------------------------------+---------------------------------+------------------------------+--------------------------------+------------+-------------------------------------+ -| RN50x4 openai | 0.594 | 0.682 | 0.781 | 0.794 | 0.451 | 0.486 | 0.698 | 0.887 | 0.367 | 0.335 | 0.532 | 0.569 | 0.318 | 0.205 | 0.082 | 0.031 | 0.026 | 0.056 | 0.108 | 0.162 | 0.233 | 
-+---------------------------------+------------+---------+-----------------+--------------+---------------+----------+--------------+-----------+-----------+--------------+---------------+-----------+---------------------------+----------------------+------------------------------------+--------------------------------+---------------------------------+------------------------------+--------------------------------+------------+-------------------------------------+ -| RN50x64 openai | 0.670 | 0.740 | 0.834 | 0.851 | 0.598 | 0.531 | 0.788 | 0.936 | 0.481 | 0.577 | 0.628 | 0.539 | 0.073 | 0.227 | 0.200 | 0.034 | 0.025 | 0.056 | 0.125 | 0.158 | 0.311 | -+---------------------------------+------------+---------+-----------------+--------------+---------------+----------+--------------+-----------+-----------+--------------+---------------+-----------+---------------------------+----------------------+------------------------------------+--------------------------------+---------------------------------+------------------------------+--------------------------------+------------+-------------------------------------+ -| ViT-B-16 laion400m_e31 | 0.594 | 0.767 | 0.838 | 0.917 | 0.712 | 0.513 | 0.694 | 0.892 | 0.380 | 0.503 | 0.585 | 0.593 | 0.062 | 0.289 | 0.245 | 0.031 | 0.030 | 0.059 | 0.100 | 0.152 | 0.200 | -+---------------------------------+------------+---------+-----------------+--------------+---------------+----------+--------------+-----------+-----------+--------------+---------------+-----------+---------------------------+----------------------+------------------------------------+--------------------------------+---------------------------------+------------------------------+--------------------------------+------------+-------------------------------------+ -| ViT-B-16 laion400m_e32 | 0.597 | 0.768 | 0.837 | 0.917 | 0.712 | 0.513 | 0.692 | 0.892 | 0.385 | 0.501 | 0.585 | 0.598 | 0.077 | 0.287 | 0.245 | 0.032 | 0.029 | 0.060 | 0.099 | 0.151 | 0.183 | 
-+---------------------------------+------------+---------+-----------------+--------------+---------------+----------+--------------+-----------+-----------+--------------+---------------+-----------+---------------------------+----------------------+------------------------------------+--------------------------------+---------------------------------+------------------------------+--------------------------------+------------+-------------------------------------+ -| ViT-B-16 openai | 0.619 | 0.783 | 0.819 | 0.908 | 0.669 | 0.449 | 0.712 | 0.890 | 0.313 | 0.559 | 0.582 | 0.507 | 0.036 | 0.209 | 0.158 | 0.030 | 0.023 | 0.053 | 0.122 | 0.155 | 0.263 | -+---------------------------------+------------+---------+-----------------+--------------+---------------+----------+--------------+-----------+-----------+--------------+---------------+-----------+---------------------------+----------------------+------------------------------------+--------------------------------+---------------------------------+------------------------------+--------------------------------+------------+-------------------------------------+ -| ViT-B-16-plus-240 laion400m_e31 | 0.614 | 0.764 | 0.832 | 0.925 | 0.733 | 0.555 | 0.706 | 0.904 | 0.355 | 0.569 | 0.615 | 0.551 | 0.093 | 0.240 | 0.159 | 0.041 | 0.026 | 0.056 | 0.111 | 0.149 | 0.280 | -+---------------------------------+------------+---------+-----------------+--------------+---------------+----------+--------------+-----------+-----------+--------------+---------------+-----------+---------------------------+----------------------+------------------------------------+--------------------------------+---------------------------------+------------------------------+--------------------------------+------------+-------------------------------------+ -| ViT-B-16-plus-240 laion400m_e32 | 0.615 | 0.764 | 0.833 | 0.928 | 0.738 | 0.555 | 0.711 | 0.902 | 0.362 | 0.581 | 0.613 | 0.551 | 0.095 | 0.238 | 0.160 | 0.043 | 0.027 | 0.054 | 0.110 | 
0.148 | 0.281 | -+---------------------------------+------------+---------+-----------------+--------------+---------------+----------+--------------+-----------+-----------+--------------+---------------+-----------+---------------------------+----------------------+------------------------------------+--------------------------------+---------------------------------+------------------------------+--------------------------------+------------+-------------------------------------+ -| ViT-B-32 laion2b_e16 | 0.573 | 0.788 | 0.831 | 0.941 | 0.754 | 0.539 | 0.691 | 0.893 | 0.388 | 0.503 | 0.619 | 0.506 | 0.195 | 0.192 | 0.167 | 0.031 | 0.024 | 0.052 | 0.110 | 0.189 | 0.176 | -+---------------------------------+------------+---------+-----------------+--------------+---------------+----------+--------------+-----------+-----------+--------------+---------------+-----------+---------------------------+----------------------+------------------------------------+--------------------------------+---------------------------------+------------------------------+--------------------------------+------------+-------------------------------------+ -| ViT-B-32 laion2b_s34b_b79k | 0.581 | 0.791 | 0.839 | 0.936 | 0.755 | 0.557 | 0.716 | 0.909 | 0.410 | 0.482 | 0.610 | 0.598 | 0.734 | 0.153 | 0.189 | 0.029 | 0.034 | 0.062 | 0.113 | 0.159 | 0.262 | -+---------------------------------+------------+---------+-----------------+--------------+---------------+----------+--------------+-----------+-----------+--------------+---------------+-----------+---------------------------+----------------------+------------------------------------+--------------------------------+---------------------------------+------------------------------+--------------------------------+------------+-------------------------------------+ -| ViT-B-32 laion400m_e31 | 0.523 | 0.731 | 0.818 | 0.883 | 0.678 | 0.521 | 0.659 | 0.856 | 0.220 | 0.470 | 0.510 | 0.549 | 0.259 | 0.155 | 0.161 | 0.033 | 0.021 | 0.053 | 
0.117 | 0.173 | 0.122 | -+---------------------------------+------------+---------+-----------------+--------------+---------------+----------+--------------+-----------+-----------+--------------+---------------+-----------+---------------------------+----------------------+------------------------------------+--------------------------------+---------------------------------+------------------------------+--------------------------------+------------+-------------------------------------+ -| ViT-B-32 laion400m_e32 | 0.523 | 0.733 | 0.817 | 0.885 | 0.677 | 0.523 | 0.658 | 0.854 | 0.223 | 0.476 | 0.510 | 0.548 | 0.240 | 0.153 | 0.161 | 0.033 | 0.021 | 0.054 | 0.117 | 0.173 | 0.118 | -+---------------------------------+------------+---------+-----------------+--------------+---------------+----------+--------------+-----------+-----------+--------------+---------------+-----------+---------------------------+----------------------+------------------------------------+--------------------------------+---------------------------------+------------------------------+--------------------------------+------------+-------------------------------------+ -| ViT-B-32 openai | 0.559 | 0.764 | 0.815 | 0.898 | 0.643 | 0.443 | 0.664 | 0.873 | 0.135 | 0.504 | 0.537 | 0.623 | 0.447 | 0.232 | 0.164 | 0.037 | 0.024 | 0.061 | 0.127 | 0.193 | 0.274 | -+---------------------------------+------------+---------+-----------------+--------------+---------------+----------+--------------+-----------+-----------+--------------+---------------+-----------+---------------------------+----------------------+------------------------------------+--------------------------------+---------------------------------+------------------------------+--------------------------------+------------+-------------------------------------+ -| ViT-g-14 laion2b_s12b_b42k | 0.696 | 0.811 | 0.851 | 0.971 | 0.839 | 0.682 | 0.776 | 0.943 | 0.603 | 0.648 | 0.718 | 0.560 | 0.580 | 0.332 | 0.175 | 0.036 | 0.031 | 0.060 
| 0.115 | 0.190 | 0.138 | -+---------------------------------+------------+---------+-----------------+--------------+---------------+----------+--------------+-----------+-----------+--------------+---------------+-----------+---------------------------+----------------------+------------------------------------+--------------------------------+---------------------------------+------------------------------+--------------------------------+------------+-------------------------------------+ -| ViT-H-14 laion2b_s32b_b79k | 0.709 | 0.777 | 0.850 | 0.975 | 0.847 | 0.678 | 0.801 | 0.945 | 0.563 | 0.726 | 0.699 | 0.542 | 0.297 | 0.268 | 0.169 | 0.032 | 0.027 | 0.054 | 0.111 | 0.140 | 0.110 | -+---------------------------------+------------+---------+-----------------+--------------+---------------+----------+--------------+-----------+-----------+--------------+---------------+-----------+---------------------------+----------------------+------------------------------------+--------------------------------+---------------------------------+------------------------------+--------------------------------+------------+-------------------------------------+ -| ViT-L-14 laion2b_s32b_b82k | 0.677 | 0.805 | 0.851 | 0.966 | 0.833 | 0.629 | 0.758 | 0.932 | 0.459 | 0.646 | 0.668 | 0.563 | 0.116 | 0.312 | 0.161 | 0.032 | 0.020 | 0.056 | 0.108 | 0.224 | 0.229 | -+---------------------------------+------------+---------+-----------------+--------------+---------------+----------+--------------+-----------+-----------+--------------+---------------+-----------+---------------------------+----------------------+------------------------------------+--------------------------------+---------------------------------+------------------------------+--------------------------------+------------+-------------------------------------+ -| ViT-L-14 laion400m_e31 | 0.654 | 0.758 | 0.839 | 0.947 | 0.774 | 0.598 | 0.757 | 0.917 | 0.378 | 0.632 | 0.671 | 0.487 | 0.058 | 0.242 | 0.149 | 0.030 | 
0.026 | 0.053 | 0.109 | 0.186 | 0.200 | -+---------------------------------+------------+---------+-----------------+--------------+---------------+----------+--------------+-----------+-----------+--------------+---------------+-----------+---------------------------+----------------------+------------------------------------+--------------------------------+---------------------------------+------------------------------+--------------------------------+------------+-------------------------------------+ -| ViT-L-14 laion400m_e32 | 0.654 | 0.756 | 0.839 | 0.946 | 0.774 | 0.605 | 0.756 | 0.919 | 0.380 | 0.622 | 0.675 | 0.493 | 0.061 | 0.243 | 0.149 | 0.030 | 0.026 | 0.053 | 0.110 | 0.186 | 0.203 | -+---------------------------------+------------+---------+-----------------+--------------+---------------+----------+--------------+-----------+-----------+--------------+---------------+-----------+---------------------------+----------------------+------------------------------------+--------------------------------+---------------------------------+------------------------------+--------------------------------+------------+-------------------------------------+ -| ViT-L-14 openai | 0.698 | 0.783 | 0.835 | 0.956 | 0.758 | 0.554 | 0.792 | 0.932 | 0.571 | 0.626 | 0.633 | 0.520 | 0.733 | 0.194 | 0.161 | 0.032 | 0.023 | 0.045 | 0.115 | 0.163 | 0.218 | -+---------------------------------+------------+---------+-----------------+--------------+---------------+----------+--------------+-----------+-----------+--------------+---------------+-----------+---------------------------+----------------------+------------------------------------+--------------------------------+---------------------------------+------------------------------+--------------------------------+------------+-------------------------------------+ -| ViT-L-14-336 openai | 0.709 | 0.781 | 0.837 | 0.949 | 0.744 | 0.556 | 0.783 | 0.937 | 0.560 | 0.615 | 0.638 | 0.608 | 0.733 | 0.200 | 0.158 | 0.032 | 0.024 
| 0.046 | 0.113 | 0.158 | 0.262 | -+---------------------------------+------------+---------+-----------------+--------------+---------------+----------+--------------+-----------+-----------+--------------+---------------+-----------+---------------------------+----------------------+------------------------------------+--------------------------------+---------------------------------+------------------------------+--------------------------------+------------+-------------------------------------+ ++----------------------------------+------------+---------+----------------------------------------------------------------------+---------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------+ +| Model | ImageNetV2 | VOC2007 | VTAB natural | VTAB specialized | VTAB structured | +| | | +------------+-----------+-------+------------+-------+--------+-------+---------+----------+----------------+-------------+-------------+----------------+-------------------+----------------------+-------------------+---------------------+-------+----------------+ +| | | | Caltech101 | CIFAR-100 | DTD | Flowers102 | Pets | Sun397 | SVHN | EuroSAT | Resisc45 | Patch Camelyon | Retinopathy | Clevr/count | Clevr/distance | dSprites/location | dSprites/orientation | SmallNORB/azimuth | SmallNORB/elevation | DMLab | KITTI/distance | ++==================================+============+=========+============+===========+=======+============+=======+========+=======+=========+==========+================+=============+=============+================+===================+======================+===================+=====================+=======+================+ +| RN50::openai | 0.529 | 0.650 | 0.772 | 0.403 | 0.415 | 0.660 | 0.857 | 0.894 | 0.303 | 0.408 | 0.453 | 0.636 | 0.171 | 0.217 | 0.148 | 0.034 | 0.014 | 0.056 | 0.110 | 0.145 | 0.170 | 
++----------------------------------+------------+---------+------------+-----------+-------+------------+-------+--------+-------+---------+----------+----------------+-------------+-------------+----------------+-------------------+----------------------+-------------------+---------------------+-------+----------------+ +| RN50::yfcc15m | 0.214 | 0.215 | 0.402 | 0.116 | 0.122 | 0.167 | 0.174 | 0.127 | 0.157 | 0.172 | 0.123 | 0.533 | 0.358 | 0.151 | 0.158 | 0.032 | 0.024 | 0.053 | 0.120 | 0.160 | 0.336 | ++----------------------------------+------------+---------+------------+-----------+-------+------------+-------+--------+-------+---------+----------+----------------+-------------+-------------+----------------+-------------------+----------------------+-------------------+---------------------+-------+----------------+ +| RN50::cc12m | 0.224 | 0.438 | 0.582 | 0.178 | 0.135 | 0.095 | 0.331 | 0.123 | 0.102 | 0.148 | 0.117 | 0.535 | 0.293 | 0.184 | 0.222 | 0.031 | 0.025 | 0.047 | 0.096 | 0.161 | 0.155 | ++----------------------------------+------------+---------+------------+-----------+-------+------------+-------+--------+-------+---------+----------+----------------+-------------+-------------+----------------+-------------------+----------------------+-------------------+---------------------+-------+----------------+ +| RN101::openai | 0.561 | 0.651 | 0.780 | 0.476 | 0.432 | 0.652 | 0.869 | 0.887 | 0.226 | 0.314 | 0.547 | 0.583 | 0.280 | 0.242 | 0.130 | 0.031 | 0.021 | 0.054 | 0.111 | 0.139 | 0.263 | ++----------------------------------+------------+---------+------------+-----------+-------+------------+-------+--------+-------+---------+----------+----------------+-------------+-------------+----------------+-------------------+----------------------+-------------------+---------------------+-------+----------------+ +| RN101::yfcc15m | 0.221 | 0.243 | 0.469 | 0.125 | 0.117 | 0.210 | 0.177 | 0.128 | 0.137 | 0.151 | 0.099 | 0.479 | 0.584 | 0.109 | 0.159 | 
0.031 | 0.019 | 0.055 | 0.097 | 0.153 | 0.252 | ++----------------------------------+------------+---------+------------+-----------+-------+------------+-------+--------+-------+---------+----------+----------------+-------------+-------------+----------------+-------------------+----------------------+-------------------+---------------------+-------+----------------+ +| RN50x4::openai | 0.594 | 0.682 | 0.781 | 0.451 | 0.486 | 0.698 | 0.887 | 0.908 | 0.367 | 0.335 | 0.532 | 0.569 | 0.318 | 0.205 | 0.082 | 0.031 | 0.026 | 0.056 | 0.108 | 0.162 | 0.233 | ++----------------------------------+------------+---------+------------+-----------+-------+------------+-------+--------+-------+---------+----------+----------------+-------------+-------------+----------------+-------------------+----------------------+-------------------+---------------------+-------+----------------+ +| RN50x16::openai | 0.643 | 0.680 | 0.810 | 0.522 | 0.524 | 0.724 | 0.898 | 0.917 | 0.409 | 0.433 | 0.589 | 0.625 | 0.715 | 0.195 | 0.213 | 0.030 | 0.026 | 0.050 | 0.116 | 0.146 | 0.229 | ++----------------------------------+------------+---------+------------+-----------+-------+------------+-------+--------+-------+---------+----------+----------------+-------------+-------------+----------------+-------------------+----------------------+-------------------+---------------------+-------+----------------+ +| RN50x64::openai | 0.670 | 0.740 | 0.834 | 0.598 | 0.531 | 0.788 | 0.936 | 0.931 | 0.481 | 0.577 | 0.628 | 0.539 | 0.073 | 0.227 | 0.200 | 0.034 | 0.025 | 0.056 | 0.125 | 0.158 | 0.311 | ++----------------------------------+------------+---------+------------+-----------+-------+------------+-------+--------+-------+---------+----------+----------------+-------------+-------------+----------------+-------------------+----------------------+-------------------+---------------------+-------+----------------+ +| ViT-B-32::openai | 0.559 | 0.764 | 0.815 | 0.643 | 0.443 | 0.664 | 0.873 | 0.913 | 
0.135 | 0.504 | 0.537 | 0.623 | 0.447 | 0.232 | 0.164 | 0.037 | 0.024 | 0.061 | 0.127 | 0.193 | 0.274 | ++----------------------------------+------------+---------+------------+-----------+-------+------------+-------+--------+-------+---------+----------+----------------+-------------+-------------+----------------+-------------------+----------------------+-------------------+---------------------+-------+----------------+ +| ViT-B-32::laion2b_e16 | 0.573 | 0.788 | 0.831 | 0.754 | 0.539 | 0.691 | 0.893 | 0.933 | 0.388 | 0.503 | 0.619 | 0.506 | 0.195 | 0.192 | 0.167 | 0.031 | 0.024 | 0.052 | 0.110 | 0.189 | 0.176 | ++----------------------------------+------------+---------+------------+-----------+-------+------------+-------+--------+-------+---------+----------+----------------+-------------+-------------+----------------+-------------------+----------------------+-------------------+---------------------+-------+----------------+ +| ViT-B-32::laion400m_e31 | 0.523 | 0.731 | 0.818 | 0.678 | 0.521 | 0.659 | 0.856 | 0.918 | 0.220 | 0.470 | 0.510 | 0.549 | 0.259 | 0.155 | 0.161 | 0.033 | 0.021 | 0.053 | 0.117 | 0.173 | 0.122 | ++----------------------------------+------------+---------+------------+-----------+-------+------------+-------+--------+-------+---------+----------+----------------+-------------+-------------+----------------+-------------------+----------------------+-------------------+---------------------+-------+----------------+ +| ViT-B-32::laion400m_e32 | 0.523 | 0.733 | 0.817 | 0.677 | 0.523 | 0.658 | 0.854 | 0.917 | 0.223 | 0.476 | 0.510 | 0.548 | 0.240 | 0.153 | 0.161 | 0.033 | 0.021 | 0.054 | 0.117 | 0.173 | 0.118 | ++----------------------------------+------------+---------+------------+-----------+-------+------------+-------+--------+-------+---------+----------+----------------+-------------+-------------+----------------+-------------------+----------------------+-------------------+---------------------+-------+----------------+ +| 
ViT-B-32::laion2b_s34b_b79k | 0.581 | 0.791 | 0.839 | 0.755 | 0.557 | 0.716 | 0.909 | 0.937 | 0.410 | 0.482 | 0.610 | 0.598 | 0.734 | 0.153 | 0.189 | 0.029 | 0.034 | 0.062 | 0.113 | 0.159 | 0.262 | ++----------------------------------+------------+---------+------------+-----------+-------+------------+-------+--------+-------+---------+----------+----------------+-------------+-------------+----------------+-------------------+----------------------+-------------------+---------------------+-------+----------------+ +| ViT-B-16::openai | 0.619 | 0.783 | 0.819 | 0.669 | 0.449 | 0.712 | 0.890 | 0.924 | 0.313 | 0.559 | 0.582 | 0.507 | 0.036 | 0.209 | 0.158 | 0.030 | 0.023 | 0.053 | 0.122 | 0.155 | 0.263 | ++----------------------------------+------------+---------+------------+-----------+-------+------------+-------+--------+-------+---------+----------+----------------+-------------+-------------+----------------+-------------------+----------------------+-------------------+---------------------+-------+----------------+ +| ViT-B-16::laion400m_e31 | 0.594 | 0.767 | 0.838 | 0.712 | 0.513 | 0.694 | 0.892 | 0.939 | 0.380 | 0.503 | 0.585 | 0.593 | 0.062 | 0.289 | 0.245 | 0.031 | 0.030 | 0.059 | 0.100 | 0.152 | 0.200 | ++----------------------------------+------------+---------+------------+-----------+-------+------------+-------+--------+-------+---------+----------+----------------+-------------+-------------+----------------+-------------------+----------------------+-------------------+---------------------+-------+----------------+ +| ViT-B-16::laion400m_e32 | 0.597 | 0.768 | 0.837 | 0.712 | 0.513 | 0.692 | 0.892 | 0.939 | 0.385 | 0.501 | 0.585 | 0.598 | 0.077 | 0.287 | 0.245 | 0.032 | 0.029 | 0.060 | 0.099 | 0.151 | 0.183 | 
++----------------------------------+------------+---------+------------+-----------+-------+------------+-------+--------+-------+---------+----------+----------------+-------------+-------------+----------------+-------------------+----------------------+-------------------+---------------------+-------+----------------+ +| ViT-B-16-plus-240::laion400m_e31 | 0.614 | 0.764 | 0.832 | 0.733 | 0.555 | 0.706 | 0.904 | 0.940 | 0.355 | 0.569 | 0.615 | 0.551 | 0.093 | 0.240 | 0.159 | 0.041 | 0.026 | 0.056 | 0.111 | 0.149 | 0.280 | ++----------------------------------+------------+---------+------------+-----------+-------+------------+-------+--------+-------+---------+----------+----------------+-------------+-------------+----------------+-------------------+----------------------+-------------------+---------------------+-------+----------------+ +| ViT-B-16-plus-240::laion400m_e32 | 0.615 | 0.764 | 0.833 | 0.738 | 0.555 | 0.711 | 0.902 | 0.940 | 0.362 | 0.581 | 0.613 | 0.551 | 0.095 | 0.238 | 0.160 | 0.043 | 0.027 | 0.054 | 0.110 | 0.148 | 0.281 | ++----------------------------------+------------+---------+------------+-----------+-------+------------+-------+--------+-------+---------+----------+----------------+-------------+-------------+----------------+-------------------+----------------------+-------------------+---------------------+-------+----------------+ +| ViT-L-14::openai | 0.698 | 0.783 | 0.835 | 0.758 | 0.554 | 0.792 | 0.932 | 0.937 | 0.571 | 0.626 | 0.633 | 0.520 | 0.733 | 0.194 | 0.161 | 0.032 | 0.023 | 0.045 | 0.115 | 0.163 | 0.218 | ++----------------------------------+------------+---------+------------+-----------+-------+------------+-------+--------+-------+---------+----------+----------------+-------------+-------------+----------------+-------------------+----------------------+-------------------+---------------------+-------+----------------+ +| ViT-L-14::laion400m_e31 | 0.654 | 0.758 | 0.839 | 0.774 | 0.598 | 0.757 | 0.917 | 0.950 | 
0.378 | 0.632 | 0.671 | 0.487 | 0.058 | 0.242 | 0.149 | 0.030 | 0.026 | 0.053 | 0.109 | 0.186 | 0.200 | ++----------------------------------+------------+---------+------------+-----------+-------+------------+-------+--------+-------+---------+----------+----------------+-------------+-------------+----------------+-------------------+----------------------+-------------------+---------------------+-------+----------------+ +| ViT-L-14::laion400m_e32 | 0.654 | 0.756 | 0.839 | 0.774 | 0.605 | 0.756 | 0.919 | 0.950 | 0.380 | 0.622 | 0.675 | 0.493 | 0.061 | 0.243 | 0.149 | 0.030 | 0.026 | 0.053 | 0.110 | 0.186 | 0.203 | ++----------------------------------+------------+---------+------------+-----------+-------+------------+-------+--------+-------+---------+----------+----------------+-------------+-------------+----------------+-------------------+----------------------+-------------------+---------------------+-------+----------------+ +| ViT-L-14::laion2b_s32b_b82k | 0.677 | 0.805 | 0.851 | 0.833 | 0.629 | 0.758 | 0.932 | 0.958 | 0.459 | 0.646 | 0.668 | 0.563 | 0.116 | 0.312 | 0.161 | 0.032 | 0.020 | 0.056 | 0.108 | 0.224 | 0.229 | ++----------------------------------+------------+---------+------------+-----------+-------+------------+-------+--------+-------+---------+----------+----------------+-------------+-------------+----------------+-------------------+----------------------+-------------------+---------------------+-------+----------------+ +| ViT-L-14-336::openai | 0.709 | 0.781 | 0.837 | 0.744 | 0.556 | 0.783 | 0.937 | 0.940 | 0.560 | 0.615 | 0.638 | 0.608 | 0.733 | 0.200 | 0.158 | 0.032 | 0.024 | 0.046 | 0.113 | 0.158 | 0.262 | ++----------------------------------+------------+---------+------------+-----------+-------+------------+-------+--------+-------+---------+----------+----------------+-------------+-------------+----------------+-------------------+----------------------+-------------------+---------------------+-------+----------------+ +| 
ViT-H-14::laion2b_s32b_b79k | 0.709 | 0.777 | 0.850 | 0.847 | 0.678 | 0.801 | 0.945 | 0.961 | 0.563 | 0.726 | 0.699 | 0.542 | 0.297 | 0.268 | 0.169 | 0.032 | 0.027 | 0.054 | 0.111 | 0.140 | 0.110 | ++----------------------------------+------------+---------+------------+-----------+-------+------------+-------+--------+-------+---------+----------+----------------+-------------+-------------+----------------+-------------------+----------------------+-------------------+---------------------+-------+----------------+ +| ViT-g-14::laion2b_s12b_b42k | 0.696 | 0.811 | 0.851 | 0.839 | 0.682 | 0.776 | 0.943 | 0.962 | 0.603 | 0.648 | 0.718 | 0.560 | 0.580 | 0.332 | 0.175 | 0.036 | 0.031 | 0.060 | 0.115 | 0.190 | 0.138 | ++----------------------------------+------------+---------+------------+-----------+-------+------------+-------+--------+-------+---------+----------+----------------+-------------+-------------+----------------+-------------------+----------------------+-------------------+---------------------+-------+----------------+ \ No newline at end of file From e37aa8b3f137c47c27ae1eb9087327f33ffc3e9e Mon Sep 17 00:00:00 2001 From: ZiniuYu Date: Fri, 30 Sep 2022 22:30:00 +0800 Subject: [PATCH 19/34] docs: highlight retrieval --- docs/user-guides/benchmark.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/user-guides/benchmark.rst b/docs/user-guides/benchmark.rst index 57613360a..8c899b3bb 100644 --- a/docs/user-guides/benchmark.rst +++ b/docs/user-guides/benchmark.rst @@ -126,7 +126,7 @@ The top-5 image-to-text retrieval recall for each image is the number of top-5 r +----------------------------------+-------------+-------------+---------+-------------+-------------+---------+-------------+-------------+---------+ | ViT-g-14::laion2b_s12b_b42k | 0.724 | 0.853 | 0.788 | 0.730 | 0.846 | 0.788 | 0.639 | 0.806 | 0.722 | 
+----------------------------------+-------------+-------------+---------+-------------+-------------+---------+-------------+-------------+---------+ -| ViT-H-14::laion2b_s32b_b79k | 0.734 | 0.861 | 0.797 | 0.746 | 0.856 | 0.801 | 0.657 | 0.823 | 0.740 | +| ViT-H-14::laion2b_s32b_b79k | **0.734** | **0.861 ** | **0.797** | **0.746 ** | **0.856** | **0.801** | **0.657** | **0.823** | **0.740** | +----------------------------------+-------------+-------------+---------+-------------+-------------+---------+-------------+-------------+---------+ | ViT-L-14::laion2b_s32b_b82k | 0.711 | 0.840 | 0.775 | 0.712 | 0.824 | 0.768 | 0.620 | 0.789 | 0.704 | +----------------------------------+-------------+-------------+---------+-------------+-------------+---------+-------------+-------------+---------+ @@ -144,7 +144,7 @@ Zero-shot classification In zero-shot classification benchmark, each model is evaluated on the following datasets: ImageNetV2, VOC2007 and 19 VTAB datasets. The results are shown in the following table. -For each dataset, we report the top-1 accuracy. +For each dataset, we report the top-1 accuracy, which is whether the top-1 retrieved class of a image matches its true class. 
 +----------------------------------+------------+---------+----------------------------------------------------------------------+---------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------+
 | Model                            | ImageNetV2 | VOC2007 | VTAB natural                                                         | VTAB specialized                                  | VTAB structured                                                                                                                              |

From 5446611737096235c5cc57659d1372c2cbbb5e6e Mon Sep 17 00:00:00 2001
From: ZiniuYu
Date: Fri, 30 Sep 2022 22:43:37 +0800
Subject: [PATCH 20/34] docs: highlight retrieval

---
 docs/user-guides/benchmark.rst | 110 ++++++++++++++++-----------------
 1 file changed, 55 insertions(+), 55 deletions(-)

diff --git a/docs/user-guides/benchmark.rst b/docs/user-guides/benchmark.rst
index 8c899b3bb..b1f358054 100644
--- a/docs/user-guides/benchmark.rst
+++ b/docs/user-guides/benchmark.rst
@@ -83,61 +83,61 @@ More specifically, the top-5 text-to-image retrieval recall for each retrieved i
 It is 1 if the input text matches one of the image descriptions among the top-5.
 The top-5 image-to-text retrieval recall for each image is the number of top-5 retrieved texts matching that image descriptions.
-+----------------------------------+-------------------------------------+-------------------------------------+-------------------------------------+ -| Model | COCO Caption | Flickr8k | Flickr30k | -| +-------------+-------------+---------+-------------+-------------+---------+-------------+-------------+---------+ -| | Text->image | Image->text | Average | Text->image | Image->text | Average | Text->image | Image->text | Average | -+==================================+=============+=============+=========+=============+=============+=========+=============+=============+=========+ -| RN101::openai | 0.555 | 0.745 | 0.650 | 0.523 | 0.694 | 0.608 | 0.415 | 0.629 | 0.522 | -+----------------------------------+-------------+-------------+---------+-------------+-------------+---------+-------------+-------------+---------+ -| RN101::yfcc15m | 0.376 | 0.549 | 0.463 | 0.251 | 0.417 | 0.334 | 0.156 | 0.296 | 0.226 | -+----------------------------------+-------------+-------------+---------+-------------+-------------+---------+-------------+-------------+---------+ -| RN50::cc12m | 0.446 | 0.607 | 0.527 | 0.302 | 0.435 | 0.369 | 0.204 | 0.316 | 0.260 | -+----------------------------------+-------------+-------------+---------+-------------+-------------+---------+-------------+-------------+---------+ -| RN50::openai | 0.529 | 0.728 | 0.629 | 0.504 | 0.690 | 0.597 | 0.392 | 0.621 | 0.506 | -+----------------------------------+-------------+-------------+---------+-------------+-------------+---------+-------------+-------------+---------+ -| RN50::yfcc15m | 0.361 | 0.534 | 0.447 | 0.238 | 0.394 | 0.316 | 0.146 | 0.278 | 0.212 | -+----------------------------------+-------------+-------------+---------+-------------+-------------+---------+-------------+-------------+---------+ -| RN50x16::openai | 0.600 | 0.787 | 0.693 | 0.597 | 0.768 | 0.682 | 0.496 | 0.713 | 0.604 | 
-+----------------------------------+-------------+-------------+---------+-------------+-------------+---------+-------------+-------------+---------+ -| RN50x4::openai | 0.581 | 0.767 | 0.674 | 0.558 | 0.729 | 0.643 | 0.451 | 0.671 | 0.561 | -+----------------------------------+-------------+-------------+---------+-------------+-------------+---------+-------------+-------------+---------+ -| RN50x64::openai | 0.599 | 0.803 | 0.701 | 0.629 | 0.790 | 0.709 | 0.534 | 0.756 | 0.645 | -+----------------------------------+-------------+-------------+---------+-------------+-------------+---------+-------------+-------------+---------+ -| ViT-B-16::laion400m_e31 | 0.637 | 0.796 | 0.717 | 0.620 | 0.765 | 0.692 | 0.506 | 0.697 | 0.602 | -+----------------------------------+-------------+-------------+---------+-------------+-------------+---------+-------------+-------------+---------+ -| ViT-B-16::laion400m_e32 | 0.636 | 0.796 | 0.716 | 0.620 | 0.767 | 0.694 | 0.508 | 0.697 | 0.603 | -+----------------------------------+-------------+-------------+---------+-------------+-------------+---------+-------------+-------------+---------+ -| ViT-B-16::openai | 0.584 | 0.767 | 0.676 | 0.564 | 0.727 | 0.646 | 0.452 | 0.671 | 0.561 | -+----------------------------------+-------------+-------------+---------+-------------+-------------+---------+-------------+-------------+---------+ -| ViT-B-16-plus-240::laion400m_e31 | 0.660 | 0.809 | 0.735 | 0.642 | 0.788 | 0.715 | 0.533 | 0.725 | 0.629 | -+----------------------------------+-------------+-------------+---------+-------------+-------------+---------+-------------+-------------+---------+ -| ViT-B-16-plus-240::laion400m_e32 | 0.662 | 0.811 | 0.736 | 0.644 | 0.791 | 0.718 | 0.535 | 0.727 | 0.631 | -+----------------------------------+-------------+-------------+---------+-------------+-------------+---------+-------------+-------------+---------+ -| ViT-B-32::laion2b_e16 | 0.647 | 0.795 | 0.721 | 0.622 | 0.760 | 0.691 | 0.507 | 
0.687 | 0.597 | -+----------------------------------+-------------+-------------+---------+-------------+-------------+---------+-------------+-------------+---------+ -| ViT-B-32::laion2b_s34b_b79k | 0.654 | 0.798 | 0.726 | 0.629 | 0.778 | 0.703 | 0.513 | 0.694 | 0.603 | -+----------------------------------+-------------+-------------+---------+-------------+-------------+---------+-------------+-------------+---------+ -| ViT-B-32::laion400m_e31 | 0.600 | 0.763 | 0.682 | 0.562 | 0.736 | 0.649 | 0.438 | 0.633 | 0.536 | -+----------------------------------+-------------+-------------+---------+-------------+-------------+---------+-------------+-------------+---------+ -| ViT-B-32::laion400m_e32 | 0.600 | 0.765 | 0.682 | 0.562 | 0.736 | 0.649 | 0.437 | 0.634 | 0.536 | -+----------------------------------+-------------+-------------+---------+-------------+-------------+---------+-------------+-------------+---------+ -| ViT-B-32::openai | 0.560 | 0.749 | 0.654 | 0.532 | 0.699 | 0.616 | 0.413 | 0.629 | 0.521 | -+----------------------------------+-------------+-------------+---------+-------------+-------------+---------+-------------+-------------+---------+ -| ViT-g-14::laion2b_s12b_b42k | 0.724 | 0.853 | 0.788 | 0.730 | 0.846 | 0.788 | 0.639 | 0.806 | 0.722 | -+----------------------------------+-------------+-------------+---------+-------------+-------------+---------+-------------+-------------+---------+ -| ViT-H-14::laion2b_s32b_b79k | **0.734** | **0.861 ** | **0.797** | **0.746 ** | **0.856** | **0.801** | **0.657** | **0.823** | **0.740** | -+----------------------------------+-------------+-------------+---------+-------------+-------------+---------+-------------+-------------+---------+ -| ViT-L-14::laion2b_s32b_b82k | 0.711 | 0.840 | 0.775 | 0.712 | 0.824 | 0.768 | 0.620 | 0.789 | 0.704 | -+----------------------------------+-------------+-------------+---------+-------------+-------------+---------+-------------+-------------+---------+ -| 
ViT-L-14::laion400m_e31 | 0.680 | 0.821 | 0.750 | 0.675 | 0.806 | 0.741 | 0.570 | 0.751 | 0.661 | -+----------------------------------+-------------+-------------+---------+-------------+-------------+---------+-------------+-------------+---------+ -| ViT-L-14::laion400m_e32 | 0.680 | 0.821 | 0.751 | 0.675 | 0.806 | 0.740 | 0.570 | 0.751 | 0.661 | -+----------------------------------+-------------+-------------+---------+-------------+-------------+---------+-------------+-------------+---------+ -| ViT-L-14::openai | 0.610 | 0.793 | 0.702 | 0.599 | 0.767 | 0.683 | 0.494 | 0.717 | 0.605 | -+----------------------------------+-------------+-------------+---------+-------------+-------------+---------+-------------+-------------+---------+ -| ViT-L-14-336::openai | 0.616 | 0.812 | 0.714 | 0.629 | 0.779 | 0.704 | 0.533 | 0.741 | 0.637 | -+----------------------------------+-------------+-------------+---------+-------------+-------------+---------+-------------+-------------+---------+ ++----------------------------------+-------------------------------------------+-------------------------------------------+-------------------------------------------+ +| Model | COCO Caption | Flickr 8k | Flickr 30k | +| +---------------+---------------+-----------+---------------+---------------+-----------+---------------+---------------+-----------+ +| | Text to image | Image to text | Average | Text to image | Image to text | Average | Text to image | Image to text | Average | ++==================================+===============+===============+===========+===============+===============+===========+===============+===============+===========+ +| RN50::openai | 0.529 | 0.728 | 0.629 | 0.504 | 0.690 | 0.597 | 0.392 | 0.621 | 0.506 | ++----------------------------------+---------------+---------------+-----------+---------------+---------------+-----------+---------------+---------------+-----------+ +| RN50::yfcc15m | 0.361 | 0.534 | 0.447 | 0.238 | 0.394 | 0.316 | 0.146 | 0.278 
| 0.212 | ++----------------------------------+---------------+---------------+-----------+---------------+---------------+-----------+---------------+---------------+-----------+ +| RN50::cc12m | 0.446 | 0.607 | 0.527 | 0.302 | 0.435 | 0.369 | 0.204 | 0.316 | 0.260 | ++----------------------------------+---------------+---------------+-----------+---------------+---------------+-----------+---------------+---------------+-----------+ +| RN101::openai | 0.555 | 0.745 | 0.650 | 0.523 | 0.694 | 0.608 | 0.415 | 0.629 | 0.522 | ++----------------------------------+---------------+---------------+-----------+---------------+---------------+-----------+---------------+---------------+-----------+ +| RN101::yfcc15m | 0.376 | 0.549 | 0.463 | 0.251 | 0.417 | 0.334 | 0.156 | 0.296 | 0.226 | ++----------------------------------+---------------+---------------+-----------+---------------+---------------+-----------+---------------+---------------+-----------+ +| RN50x4::openai | 0.581 | 0.767 | 0.674 | 0.558 | 0.729 | 0.643 | 0.451 | 0.671 | 0.561 | ++----------------------------------+---------------+---------------+-----------+---------------+---------------+-----------+---------------+---------------+-----------+ +| RN50x16::openai | 0.600 | 0.787 | 0.693 | 0.597 | 0.768 | 0.682 | 0.496 | 0.713 | 0.604 | ++----------------------------------+---------------+---------------+-----------+---------------+---------------+-----------+---------------+---------------+-----------+ +| RN50x64::openai | 0.599 | 0.803 | 0.701 | 0.629 | 0.790 | 0.709 | 0.534 | 0.756 | 0.645 | ++----------------------------------+---------------+---------------+-----------+---------------+---------------+-----------+---------------+---------------+-----------+ +| ViT-B-32::openai | 0.560 | 0.749 | 0.654 | 0.532 | 0.699 | 0.616 | 0.413 | 0.629 | 0.521 | 
++----------------------------------+---------------+---------------+-----------+---------------+---------------+-----------+---------------+---------------+-----------+ +| ViT-B-32::laion2b_e16 | 0.647 | 0.795 | 0.721 | 0.622 | 0.760 | 0.691 | 0.507 | 0.687 | 0.597 | ++----------------------------------+---------------+---------------+-----------+---------------+---------------+-----------+---------------+---------------+-----------+ +| ViT-B-32::laion400m_e31 | 0.600 | 0.763 | 0.682 | 0.562 | 0.736 | 0.649 | 0.438 | 0.633 | 0.536 | ++----------------------------------+---------------+---------------+-----------+---------------+---------------+-----------+---------------+---------------+-----------+ +| ViT-B-32::laion400m_e32 | 0.600 | 0.765 | 0.682 | 0.562 | 0.736 | 0.649 | 0.437 | 0.634 | 0.536 | ++----------------------------------+---------------+---------------+-----------+---------------+---------------+-----------+---------------+---------------+-----------+ +| ViT-B-32::laion2b_s34b_b79k | 0.654 | 0.798 | 0.726 | 0.629 | 0.778 | 0.703 | 0.513 | 0.694 | 0.603 | ++----------------------------------+---------------+---------------+-----------+---------------+---------------+-----------+---------------+---------------+-----------+ +| ViT-B-16::openai | 0.584 | 0.767 | 0.676 | 0.564 | 0.727 | 0.646 | 0.452 | 0.671 | 0.561 | ++----------------------------------+---------------+---------------+-----------+---------------+---------------+-----------+---------------+---------------+-----------+ +| ViT-B-16::laion400m_e31 | 0.637 | 0.796 | 0.717 | 0.620 | 0.765 | 0.692 | 0.506 | 0.697 | 0.602 | ++----------------------------------+---------------+---------------+-----------+---------------+---------------+-----------+---------------+---------------+-----------+ +| ViT-B-16::laion400m_e32 | 0.636 | 0.796 | 0.716 | 0.620 | 0.767 | 0.694 | 0.508 | 0.697 | 0.603 | 
++----------------------------------+---------------+---------------+-----------+---------------+---------------+-----------+---------------+---------------+-----------+ +| ViT-B-16-plus-240::laion400m_e31 | 0.660 | 0.809 | 0.735 | 0.642 | 0.788 | 0.715 | 0.533 | 0.725 | 0.629 | ++----------------------------------+---------------+---------------+-----------+---------------+---------------+-----------+---------------+---------------+-----------+ +| ViT-B-16-plus-240::laion400m_e32 | 0.662 | 0.811 | 0.736 | 0.644 | 0.791 | 0.718 | 0.535 | 0.727 | 0.631 | ++----------------------------------+---------------+---------------+-----------+---------------+---------------+-----------+---------------+---------------+-----------+ +| ViT-L-14::openai | 0.610 | 0.793 | 0.702 | 0.599 | 0.767 | 0.683 | 0.494 | 0.717 | 0.605 | ++----------------------------------+---------------+---------------+-----------+---------------+---------------+-----------+---------------+---------------+-----------+ +| ViT-L-14::laion400m_e31 | 0.680 | 0.821 | 0.750 | 0.675 | 0.806 | 0.741 | 0.570 | 0.751 | 0.661 | ++----------------------------------+---------------+---------------+-----------+---------------+---------------+-----------+---------------+---------------+-----------+ +| ViT-L-14::laion400m_e32 | 0.680 | 0.821 | 0.751 | 0.675 | 0.806 | 0.740 | 0.570 | 0.751 | 0.661 | ++----------------------------------+---------------+---------------+-----------+---------------+---------------+-----------+---------------+---------------+-----------+ +| ViT-L-14::laion2b_s32b_b82k | 0.711 | 0.840 | 0.775 | 0.712 | 0.824 | 0.768 | 0.620 | 0.789 | 0.704 | ++----------------------------------+---------------+---------------+-----------+---------------+---------------+-----------+---------------+---------------+-----------+ +| ViT-L-14-336::openai | 0.616 | 0.812 | 0.714 | 0.629 | 0.779 | 0.704 | 0.533 | 0.741 | 0.637 | 
++----------------------------------+---------------+---------------+-----------+---------------+---------------+-----------+---------------+---------------+-----------+ +| ViT-H-14::laion2b_s32b_b79k | **0.734** | **0.861** | **0.797** | **0.746** | **0.856** | **0.801** | **0.657** | **0.823** | **0.740** | ++----------------------------------+---------------+---------------+-----------+---------------+---------------+-----------+---------------+---------------+-----------+ +| ViT-g-14::laion2b_s12b_b42k | 0.724 | 0.853 | 0.788 | 0.730 | 0.846 | 0.788 | 0.639 | 0.806 | 0.722 | ++----------------------------------+---------------+---------------+-----------+---------------+---------------+-----------+---------------+---------------+-----------+ Zero-shot classification ------------------------ From 0265b35417b6da94a5100a47fcdf3464f1431bdc Mon Sep 17 00:00:00 2001 From: ZiniuYu Date: Fri, 30 Sep 2022 22:53:30 +0800 Subject: [PATCH 21/34] docs: highlight classification --- docs/user-guides/benchmark.rst | 110 ++++++++++++++++----------------- 1 file changed, 55 insertions(+), 55 deletions(-) diff --git a/docs/user-guides/benchmark.rst b/docs/user-guides/benchmark.rst index b1f358054..f140cb5d0 100644 --- a/docs/user-guides/benchmark.rst +++ b/docs/user-guides/benchmark.rst @@ -146,58 +146,58 @@ In zero-shot classification benchmark, each model is evaluated on the following The results are shown in the following table. For each dataset, we report the top-1 accuracy, which is whether the top-1 retrieved class of a image matches its true class. 
-+----------------------------------+------------+---------+----------------------------------------------------------------------+---------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------+ -| Model | ImageNetV2 | VOC2007 | VTAB natural | VTAB specialized | VTAB structured | -| | | +------------+-----------+-------+------------+-------+--------+-------+---------+----------+----------------+-------------+-------------+----------------+-------------------+----------------------+-------------------+---------------------+-------+----------------+ -| | | | Caltech101 | CIFAR-100 | DTD | Flowers102 | Pets | Sun397 | SVHN | EuroSAT | Resisc45 | Patch Camelyon | Retinopathy | Clevr/count | Clevr/distance | dSprites/location | dSprites/orientation | SmallNORB/azimuth | SmallNORB/elevation | DMLab | KITTI/distance | -+==================================+============+=========+============+===========+=======+============+=======+========+=======+=========+==========+================+=============+=============+================+===================+======================+===================+=====================+=======+================+ -| RN50::openai | 0.529 | 0.650 | 0.772 | 0.403 | 0.415 | 0.660 | 0.857 | 0.894 | 0.303 | 0.408 | 0.453 | 0.636 | 0.171 | 0.217 | 0.148 | 0.034 | 0.014 | 0.056 | 0.110 | 0.145 | 0.170 | -+----------------------------------+------------+---------+------------+-----------+-------+------------+-------+--------+-------+---------+----------+----------------+-------------+-------------+----------------+-------------------+----------------------+-------------------+---------------------+-------+----------------+ -| RN50::yfcc15m | 0.214 | 0.215 | 0.402 | 0.116 | 0.122 | 0.167 | 0.174 | 0.127 | 0.157 | 0.172 | 0.123 | 0.533 | 0.358 | 0.151 | 0.158 | 0.032 | 0.024 | 0.053 | 0.120 | 0.160 | 0.336 | 
-+----------------------------------+------------+---------+------------+-----------+-------+------------+-------+--------+-------+---------+----------+----------------+-------------+-------------+----------------+-------------------+----------------------+-------------------+---------------------+-------+----------------+ -| RN50::cc12m | 0.224 | 0.438 | 0.582 | 0.178 | 0.135 | 0.095 | 0.331 | 0.123 | 0.102 | 0.148 | 0.117 | 0.535 | 0.293 | 0.184 | 0.222 | 0.031 | 0.025 | 0.047 | 0.096 | 0.161 | 0.155 | -+----------------------------------+------------+---------+------------+-----------+-------+------------+-------+--------+-------+---------+----------+----------------+-------------+-------------+----------------+-------------------+----------------------+-------------------+---------------------+-------+----------------+ -| RN101::openai | 0.561 | 0.651 | 0.780 | 0.476 | 0.432 | 0.652 | 0.869 | 0.887 | 0.226 | 0.314 | 0.547 | 0.583 | 0.280 | 0.242 | 0.130 | 0.031 | 0.021 | 0.054 | 0.111 | 0.139 | 0.263 | -+----------------------------------+------------+---------+------------+-----------+-------+------------+-------+--------+-------+---------+----------+----------------+-------------+-------------+----------------+-------------------+----------------------+-------------------+---------------------+-------+----------------+ -| RN101::yfcc15m | 0.221 | 0.243 | 0.469 | 0.125 | 0.117 | 0.210 | 0.177 | 0.128 | 0.137 | 0.151 | 0.099 | 0.479 | 0.584 | 0.109 | 0.159 | 0.031 | 0.019 | 0.055 | 0.097 | 0.153 | 0.252 | -+----------------------------------+------------+---------+------------+-----------+-------+------------+-------+--------+-------+---------+----------+----------------+-------------+-------------+----------------+-------------------+----------------------+-------------------+---------------------+-------+----------------+ -| RN50x4::openai | 0.594 | 0.682 | 0.781 | 0.451 | 0.486 | 0.698 | 0.887 | 0.908 | 0.367 | 0.335 | 0.532 | 0.569 | 0.318 | 0.205 | 0.082 | 
0.031 | 0.026 | 0.056 | 0.108 | 0.162 | 0.233 | -+----------------------------------+------------+---------+------------+-----------+-------+------------+-------+--------+-------+---------+----------+----------------+-------------+-------------+----------------+-------------------+----------------------+-------------------+---------------------+-------+----------------+ -| RN50x16::openai | 0.643 | 0.680 | 0.810 | 0.522 | 0.524 | 0.724 | 0.898 | 0.917 | 0.409 | 0.433 | 0.589 | 0.625 | 0.715 | 0.195 | 0.213 | 0.030 | 0.026 | 0.050 | 0.116 | 0.146 | 0.229 | -+----------------------------------+------------+---------+------------+-----------+-------+------------+-------+--------+-------+---------+----------+----------------+-------------+-------------+----------------+-------------------+----------------------+-------------------+---------------------+-------+----------------+ -| RN50x64::openai | 0.670 | 0.740 | 0.834 | 0.598 | 0.531 | 0.788 | 0.936 | 0.931 | 0.481 | 0.577 | 0.628 | 0.539 | 0.073 | 0.227 | 0.200 | 0.034 | 0.025 | 0.056 | 0.125 | 0.158 | 0.311 | -+----------------------------------+------------+---------+------------+-----------+-------+------------+-------+--------+-------+---------+----------+----------------+-------------+-------------+----------------+-------------------+----------------------+-------------------+---------------------+-------+----------------+ -| ViT-B-32::openai | 0.559 | 0.764 | 0.815 | 0.643 | 0.443 | 0.664 | 0.873 | 0.913 | 0.135 | 0.504 | 0.537 | 0.623 | 0.447 | 0.232 | 0.164 | 0.037 | 0.024 | 0.061 | 0.127 | 0.193 | 0.274 | -+----------------------------------+------------+---------+------------+-----------+-------+------------+-------+--------+-------+---------+----------+----------------+-------------+-------------+----------------+-------------------+----------------------+-------------------+---------------------+-------+----------------+ -| ViT-B-32::laion2b_e16 | 0.573 | 0.788 | 0.831 | 0.754 | 0.539 | 0.691 | 0.893 | 
0.933 | 0.388 | 0.503 | 0.619 | 0.506 | 0.195 | 0.192 | 0.167 | 0.031 | 0.024 | 0.052 | 0.110 | 0.189 | 0.176 | -+----------------------------------+------------+---------+------------+-----------+-------+------------+-------+--------+-------+---------+----------+----------------+-------------+-------------+----------------+-------------------+----------------------+-------------------+---------------------+-------+----------------+ -| ViT-B-32::laion400m_e31 | 0.523 | 0.731 | 0.818 | 0.678 | 0.521 | 0.659 | 0.856 | 0.918 | 0.220 | 0.470 | 0.510 | 0.549 | 0.259 | 0.155 | 0.161 | 0.033 | 0.021 | 0.053 | 0.117 | 0.173 | 0.122 | -+----------------------------------+------------+---------+------------+-----------+-------+------------+-------+--------+-------+---------+----------+----------------+-------------+-------------+----------------+-------------------+----------------------+-------------------+---------------------+-------+----------------+ -| ViT-B-32::laion400m_e32 | 0.523 | 0.733 | 0.817 | 0.677 | 0.523 | 0.658 | 0.854 | 0.917 | 0.223 | 0.476 | 0.510 | 0.548 | 0.240 | 0.153 | 0.161 | 0.033 | 0.021 | 0.054 | 0.117 | 0.173 | 0.118 | -+----------------------------------+------------+---------+------------+-----------+-------+------------+-------+--------+-------+---------+----------+----------------+-------------+-------------+----------------+-------------------+----------------------+-------------------+---------------------+-------+----------------+ -| ViT-B-32::laion2b_s34b_b79k | 0.581 | 0.791 | 0.839 | 0.755 | 0.557 | 0.716 | 0.909 | 0.937 | 0.410 | 0.482 | 0.610 | 0.598 | 0.734 | 0.153 | 0.189 | 0.029 | 0.034 | 0.062 | 0.113 | 0.159 | 0.262 | 
-+----------------------------------+------------+---------+------------+-----------+-------+------------+-------+--------+-------+---------+----------+----------------+-------------+-------------+----------------+-------------------+----------------------+-------------------+---------------------+-------+----------------+ -| ViT-B-16::openai | 0.619 | 0.783 | 0.819 | 0.669 | 0.449 | 0.712 | 0.890 | 0.924 | 0.313 | 0.559 | 0.582 | 0.507 | 0.036 | 0.209 | 0.158 | 0.030 | 0.023 | 0.053 | 0.122 | 0.155 | 0.263 | -+----------------------------------+------------+---------+------------+-----------+-------+------------+-------+--------+-------+---------+----------+----------------+-------------+-------------+----------------+-------------------+----------------------+-------------------+---------------------+-------+----------------+ -| ViT-B-16::laion400m_e31 | 0.594 | 0.767 | 0.838 | 0.712 | 0.513 | 0.694 | 0.892 | 0.939 | 0.380 | 0.503 | 0.585 | 0.593 | 0.062 | 0.289 | 0.245 | 0.031 | 0.030 | 0.059 | 0.100 | 0.152 | 0.200 | -+----------------------------------+------------+---------+------------+-----------+-------+------------+-------+--------+-------+---------+----------+----------------+-------------+-------------+----------------+-------------------+----------------------+-------------------+---------------------+-------+----------------+ -| ViT-B-16::laion400m_e32 | 0.597 | 0.768 | 0.837 | 0.712 | 0.513 | 0.692 | 0.892 | 0.939 | 0.385 | 0.501 | 0.585 | 0.598 | 0.077 | 0.287 | 0.245 | 0.032 | 0.029 | 0.060 | 0.099 | 0.151 | 0.183 | -+----------------------------------+------------+---------+------------+-----------+-------+------------+-------+--------+-------+---------+----------+----------------+-------------+-------------+----------------+-------------------+----------------------+-------------------+---------------------+-------+----------------+ -| ViT-B-16-plus-240::laion400m_e31 | 0.614 | 0.764 | 0.832 | 0.733 | 0.555 | 0.706 | 0.904 | 0.940 | 0.355 | 0.569 
| 0.615 | 0.551 | 0.093 | 0.240 | 0.159 | 0.041 | 0.026 | 0.056 | 0.111 | 0.149 | 0.280 | -+----------------------------------+------------+---------+------------+-----------+-------+------------+-------+--------+-------+---------+----------+----------------+-------------+-------------+----------------+-------------------+----------------------+-------------------+---------------------+-------+----------------+ -| ViT-B-16-plus-240::laion400m_e32 | 0.615 | 0.764 | 0.833 | 0.738 | 0.555 | 0.711 | 0.902 | 0.940 | 0.362 | 0.581 | 0.613 | 0.551 | 0.095 | 0.238 | 0.160 | 0.043 | 0.027 | 0.054 | 0.110 | 0.148 | 0.281 | -+----------------------------------+------------+---------+------------+-----------+-------+------------+-------+--------+-------+---------+----------+----------------+-------------+-------------+----------------+-------------------+----------------------+-------------------+---------------------+-------+----------------+ -| ViT-L-14::openai | 0.698 | 0.783 | 0.835 | 0.758 | 0.554 | 0.792 | 0.932 | 0.937 | 0.571 | 0.626 | 0.633 | 0.520 | 0.733 | 0.194 | 0.161 | 0.032 | 0.023 | 0.045 | 0.115 | 0.163 | 0.218 | -+----------------------------------+------------+---------+------------+-----------+-------+------------+-------+--------+-------+---------+----------+----------------+-------------+-------------+----------------+-------------------+----------------------+-------------------+---------------------+-------+----------------+ -| ViT-L-14::laion400m_e31 | 0.654 | 0.758 | 0.839 | 0.774 | 0.598 | 0.757 | 0.917 | 0.950 | 0.378 | 0.632 | 0.671 | 0.487 | 0.058 | 0.242 | 0.149 | 0.030 | 0.026 | 0.053 | 0.109 | 0.186 | 0.200 | -+----------------------------------+------------+---------+------------+-----------+-------+------------+-------+--------+-------+---------+----------+----------------+-------------+-------------+----------------+-------------------+----------------------+-------------------+---------------------+-------+----------------+ -| 
ViT-L-14::laion400m_e32 | 0.654 | 0.756 | 0.839 | 0.774 | 0.605 | 0.756 | 0.919 | 0.950 | 0.380 | 0.622 | 0.675 | 0.493 | 0.061 | 0.243 | 0.149 | 0.030 | 0.026 | 0.053 | 0.110 | 0.186 | 0.203 | -+----------------------------------+------------+---------+------------+-----------+-------+------------+-------+--------+-------+---------+----------+----------------+-------------+-------------+----------------+-------------------+----------------------+-------------------+---------------------+-------+----------------+ -| ViT-L-14::laion2b_s32b_b82k | 0.677 | 0.805 | 0.851 | 0.833 | 0.629 | 0.758 | 0.932 | 0.958 | 0.459 | 0.646 | 0.668 | 0.563 | 0.116 | 0.312 | 0.161 | 0.032 | 0.020 | 0.056 | 0.108 | 0.224 | 0.229 | -+----------------------------------+------------+---------+------------+-----------+-------+------------+-------+--------+-------+---------+----------+----------------+-------------+-------------+----------------+-------------------+----------------------+-------------------+---------------------+-------+----------------+ -| ViT-L-14-336::openai | 0.709 | 0.781 | 0.837 | 0.744 | 0.556 | 0.783 | 0.937 | 0.940 | 0.560 | 0.615 | 0.638 | 0.608 | 0.733 | 0.200 | 0.158 | 0.032 | 0.024 | 0.046 | 0.113 | 0.158 | 0.262 | -+----------------------------------+------------+---------+------------+-----------+-------+------------+-------+--------+-------+---------+----------+----------------+-------------+-------------+----------------+-------------------+----------------------+-------------------+---------------------+-------+----------------+ -| ViT-H-14::laion2b_s32b_b79k | 0.709 | 0.777 | 0.850 | 0.847 | 0.678 | 0.801 | 0.945 | 0.961 | 0.563 | 0.726 | 0.699 | 0.542 | 0.297 | 0.268 | 0.169 | 0.032 | 0.027 | 0.054 | 0.111 | 0.140 | 0.110 | 
-+----------------------------------+------------+---------+------------+-----------+-------+------------+-------+--------+-------+---------+----------+----------------+-------------+-------------+----------------+-------------------+----------------------+-------------------+---------------------+-------+----------------+ -| ViT-g-14::laion2b_s12b_b42k | 0.696 | 0.811 | 0.851 | 0.839 | 0.682 | 0.776 | 0.943 | 0.962 | 0.603 | 0.648 | 0.718 | 0.560 | 0.580 | 0.332 | 0.175 | 0.036 | 0.031 | 0.060 | 0.115 | 0.190 | 0.138 | -+----------------------------------+------------+---------+------------+-----------+-------+------------+-------+--------+-------+---------+----------+----------------+-------------+-------------+----------------+-------------------+----------------------+-------------------+---------------------+-------+----------------+ \ No newline at end of file ++----------------------------------+------------+-----------+-------------------------------------------------------------------------------------+------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------+ +| Model | ImageNetV2 | VOC2007 | VTAB natural | VTAB specialized | VTAB structured | +| | | +------------+-----------+-----------+------------+-----------+-----------+-----------+-----------+-----------+----------------+-------------+-------------+----------------+-------------------+----------------------+-------------------+---------------------+-----------+----------------+ +| | | | Caltech101 | CIFAR-100 | DTD | Flowers102 | Pets | Sun397 | SVHN | EuroSAT | Resisc45 | Patch Camelyon | Retinopathy | Clevr/count | Clevr/distance | dSprites/location | dSprites/orientation | SmallNORB/azimuth | SmallNORB/elevation | DMLab | KITTI/distance | 
++==================================+============+===========+============+===========+===========+============+===========+===========+===========+===========+===========+================+=============+=============+================+===================+======================+===================+=====================+===========+================+ +| RN50::openai | 0.529 | 0.650 | 0.772 | 0.403 | 0.415 | 0.660 | 0.857 | 0.894 | 0.303 | 0.408 | 0.453 | **0.636** | 0.171 | 0.217 | 0.148 | 0.034 | 0.014 | 0.056 | 0.110 | 0.145 | 0.170 | ++----------------------------------+------------+-----------+------------+-----------+-----------+------------+-----------+-----------+-----------+-----------+-----------+----------------+-------------+-------------+----------------+-------------------+----------------------+-------------------+---------------------+-----------+----------------+ +| RN50::yfcc15m | 0.214 | 0.215 | 0.402 | 0.116 | 0.122 | 0.167 | 0.174 | 0.127 | 0.157 | 0.172 | 0.123 | 0.533 | 0.358 | 0.151 | 0.158 | 0.032 | 0.024 | 0.053 | 0.120 | 0.160 | **0.336** | ++----------------------------------+------------+-----------+------------+-----------+-----------+------------+-----------+-----------+-----------+-----------+-----------+----------------+-------------+-------------+----------------+-------------------+----------------------+-------------------+---------------------+-----------+----------------+ +| RN50::cc12m | 0.224 | 0.438 | 0.582 | 0.178 | 0.135 | 0.095 | 0.331 | 0.123 | 0.102 | 0.148 | 0.117 | 0.535 | 0.293 | 0.184 | 0.222 | 0.031 | 0.025 | 0.047 | 0.096 | 0.161 | 0.155 | ++----------------------------------+------------+-----------+------------+-----------+-----------+------------+-----------+-----------+-----------+-----------+-----------+----------------+-------------+-------------+----------------+-------------------+----------------------+-------------------+---------------------+-----------+----------------+ +| RN101::openai | 0.561 | 0.651 | 
0.780 | 0.476 | 0.432 | 0.652 | 0.869 | 0.887 | 0.226 | 0.314 | 0.547 | 0.583 | 0.280 | 0.242 | 0.130 | 0.031 | 0.021 | 0.054 | 0.111 | 0.139 | 0.263 | ++----------------------------------+------------+-----------+------------+-----------+-----------+------------+-----------+-----------+-----------+-----------+-----------+----------------+-------------+-------------+----------------+-------------------+----------------------+-------------------+---------------------+-----------+----------------+ +| RN101::yfcc15m | 0.221 | 0.243 | 0.469 | 0.125 | 0.117 | 0.210 | 0.177 | 0.128 | 0.137 | 0.151 | 0.099 | 0.479 | 0.584 | 0.109 | 0.159 | 0.031 | 0.019 | 0.055 | 0.097 | 0.153 | 0.252 | ++----------------------------------+------------+-----------+------------+-----------+-----------+------------+-----------+-----------+-----------+-----------+-----------+----------------+-------------+-------------+----------------+-------------------+----------------------+-------------------+---------------------+-----------+----------------+ +| RN50x4::openai | 0.594 | 0.682 | 0.781 | 0.451 | 0.486 | 0.698 | 0.887 | 0.908 | 0.367 | 0.335 | 0.532 | 0.569 | 0.318 | 0.205 | 0.082 | 0.031 | 0.026 | 0.056 | 0.108 | 0.162 | 0.233 | ++----------------------------------+------------+-----------+------------+-----------+-----------+------------+-----------+-----------+-----------+-----------+-----------+----------------+-------------+-------------+----------------+-------------------+----------------------+-------------------+---------------------+-----------+----------------+ +| RN50x16::openai | 0.643 | 0.680 | 0.810 | 0.522 | 0.524 | 0.724 | 0.898 | 0.917 | 0.409 | 0.433 | 0.589 | 0.625 | 0.715 | 0.195 | 0.213 | 0.030 | 0.026 | 0.050 | 0.116 | 0.146 | 0.229 | 
++----------------------------------+------------+-----------+------------+-----------+-----------+------------+-----------+-----------+-----------+-----------+-----------+----------------+-------------+-------------+----------------+-------------------+----------------------+-------------------+---------------------+-----------+----------------+ +| RN50x64::openai | 0.670 | 0.740 | 0.834 | 0.598 | 0.531 | 0.788 | 0.936 | 0.931 | 0.481 | 0.577 | 0.628 | 0.539 | 0.073 | 0.227 | 0.200 | 0.034 | 0.025 | 0.056 | 0.125 | 0.158 | 0.311 | ++----------------------------------+------------+-----------+------------+-----------+-----------+------------+-----------+-----------+-----------+-----------+-----------+----------------+-------------+-------------+----------------+-------------------+----------------------+-------------------+---------------------+-----------+----------------+ +| ViT-B-32::openai | 0.559 | 0.764 | 0.815 | 0.643 | 0.443 | 0.664 | 0.873 | 0.913 | 0.135 | 0.504 | 0.537 | 0.623 | 0.447 | 0.232 | 0.164 | 0.037 | 0.024 | 0.061 | **0.127** | 0.193 | 0.274 | ++----------------------------------+------------+-----------+------------+-----------+-----------+------------+-----------+-----------+-----------+-----------+-----------+----------------+-------------+-------------+----------------+-------------------+----------------------+-------------------+---------------------+-----------+----------------+ +| ViT-B-32::laion2b_e16 | 0.573 | 0.788 | 0.831 | 0.754 | 0.539 | 0.691 | 0.893 | 0.933 | 0.388 | 0.503 | 0.619 | 0.506 | 0.195 | 0.192 | 0.167 | 0.031 | 0.024 | 0.052 | 0.110 | 0.189 | 0.176 | ++----------------------------------+------------+-----------+------------+-----------+-----------+------------+-----------+-----------+-----------+-----------+-----------+----------------+-------------+-------------+----------------+-------------------+----------------------+-------------------+---------------------+-----------+----------------+ +| 
ViT-B-32::laion400m_e31 | 0.523 | 0.731 | 0.818 | 0.678 | 0.521 | 0.659 | 0.856 | 0.918 | 0.220 | 0.470 | 0.510 | 0.549 | 0.259 | 0.155 | 0.161 | 0.033 | 0.021 | 0.053 | 0.117 | 0.173 | 0.122 | ++----------------------------------+------------+-----------+------------+-----------+-----------+------------+-----------+-----------+-----------+-----------+-----------+----------------+-------------+-------------+----------------+-------------------+----------------------+-------------------+---------------------+-----------+----------------+ +| ViT-B-32::laion400m_e32 | 0.523 | 0.733 | 0.817 | 0.677 | 0.523 | 0.658 | 0.854 | 0.917 | 0.223 | 0.476 | 0.510 | 0.548 | 0.240 | 0.153 | 0.161 | 0.033 | 0.021 | 0.054 | 0.117 | 0.173 | 0.118 | ++----------------------------------+------------+-----------+------------+-----------+-----------+------------+-----------+-----------+-----------+-----------+-----------+----------------+-------------+-------------+----------------+-------------------+----------------------+-------------------+---------------------+-----------+----------------+ +| ViT-B-32::laion2b_s34b_b79k | 0.581 | 0.791 | 0.839 | 0.755 | 0.557 | 0.716 | 0.909 | 0.937 | 0.410 | 0.482 | 0.610 | 0.598 | **0.734** | 0.153 | 0.189 | 0.029 | **0.034** | **0.062** | 0.113 | 0.159 | 0.262 | ++----------------------------------+------------+-----------+------------+-----------+-----------+------------+-----------+-----------+-----------+-----------+-----------+----------------+-------------+-------------+----------------+-------------------+----------------------+-------------------+---------------------+-----------+----------------+ +| ViT-B-16::openai | 0.619 | 0.783 | 0.819 | 0.669 | 0.449 | 0.712 | 0.890 | 0.924 | 0.313 | 0.559 | 0.582 | 0.507 | 0.036 | 0.209 | 0.158 | 0.030 | 0.023 | 0.053 | 0.122 | 0.155 | 0.263 | 
++----------------------------------+------------+-----------+------------+-----------+-----------+------------+-----------+-----------+-----------+-----------+-----------+----------------+-------------+-------------+----------------+-------------------+----------------------+-------------------+---------------------+-----------+----------------+ +| ViT-B-16::laion400m_e31 | 0.594 | 0.767 | 0.838 | 0.712 | 0.513 | 0.694 | 0.892 | 0.939 | 0.380 | 0.503 | 0.585 | 0.593 | 0.062 | 0.289 | **0.245** | 0.031 | 0.030 | 0.059 | 0.100 | 0.152 | 0.200 | ++----------------------------------+------------+-----------+------------+-----------+-----------+------------+-----------+-----------+-----------+-----------+-----------+----------------+-------------+-------------+----------------+-------------------+----------------------+-------------------+---------------------+-----------+----------------+ +| ViT-B-16::laion400m_e32 | 0.597 | 0.768 | 0.837 | 0.712 | 0.513 | 0.692 | 0.892 | 0.939 | 0.385 | 0.501 | 0.585 | 0.598 | 0.077 | 0.287 | **0.245** | 0.032 | 0.029 | 0.060 | 0.099 | 0.151 | 0.183 | ++----------------------------------+------------+-----------+------------+-----------+-----------+------------+-----------+-----------+-----------+-----------+-----------+----------------+-------------+-------------+----------------+-------------------+----------------------+-------------------+---------------------+-----------+----------------+ +| ViT-B-16-plus-240::laion400m_e31 | 0.614 | 0.764 | 0.832 | 0.733 | 0.555 | 0.706 | 0.904 | 0.940 | 0.355 | 0.569 | 0.615 | 0.551 | 0.093 | 0.240 | 0.159 | 0.041 | 0.026 | 0.056 | 0.111 | 0.149 | 0.280 | 
++----------------------------------+------------+-----------+------------+-----------+-----------+------------+-----------+-----------+-----------+-----------+-----------+----------------+-------------+-------------+----------------+-------------------+----------------------+-------------------+---------------------+-----------+----------------+ +| ViT-B-16-plus-240::laion400m_e32 | 0.615 | 0.764 | 0.833 | 0.738 | 0.555 | 0.711 | 0.902 | 0.940 | 0.362 | 0.581 | 0.613 | 0.551 | 0.095 | 0.238 | 0.160 | **0.043** | 0.027 | 0.054 | 0.110 | 0.148 | 0.281 | ++----------------------------------+------------+-----------+------------+-----------+-----------+------------+-----------+-----------+-----------+-----------+-----------+----------------+-------------+-------------+----------------+-------------------+----------------------+-------------------+---------------------+-----------+----------------+ +| ViT-L-14::openai | 0.698 | 0.783 | 0.835 | 0.758 | 0.554 | 0.792 | 0.932 | 0.937 | 0.571 | 0.626 | 0.633 | 0.520 | 0.733 | 0.194 | 0.161 | 0.032 | 0.023 | 0.045 | 0.115 | 0.163 | 0.218 | ++----------------------------------+------------+-----------+------------+-----------+-----------+------------+-----------+-----------+-----------+-----------+-----------+----------------+-------------+-------------+----------------+-------------------+----------------------+-------------------+---------------------+-----------+----------------+ +| ViT-L-14::laion400m_e31 | 0.654 | 0.758 | 0.839 | 0.774 | 0.598 | 0.757 | 0.917 | 0.950 | 0.378 | 0.632 | 0.671 | 0.487 | 0.058 | 0.242 | 0.149 | 0.030 | 0.026 | 0.053 | 0.109 | 0.186 | 0.200 | ++----------------------------------+------------+-----------+------------+-----------+-----------+------------+-----------+-----------+-----------+-----------+-----------+----------------+-------------+-------------+----------------+-------------------+----------------------+-------------------+---------------------+-----------+----------------+ +| 
ViT-L-14::laion400m_e32 | 0.654 | 0.756 | 0.839 | 0.774 | 0.605 | 0.756 | 0.919 | 0.950 | 0.380 | 0.622 | 0.675 | 0.493 | 0.061 | 0.243 | 0.149 | 0.030 | 0.026 | 0.053 | 0.110 | 0.186 | 0.203 | ++----------------------------------+------------+-----------+------------+-----------+-----------+------------+-----------+-----------+-----------+-----------+-----------+----------------+-------------+-------------+----------------+-------------------+----------------------+-------------------+---------------------+-----------+----------------+ +| ViT-L-14::laion2b_s32b_b82k | 0.677 | 0.805 | **0.851** | 0.833 | 0.629 | 0.758 | 0.932 | 0.958 | 0.459 | 0.646 | 0.668 | 0.563 | 0.116 | 0.312 | 0.161 | 0.032 | 0.020 | 0.056 | 0.108 | **0.224** | 0.229 | ++----------------------------------+------------+-----------+------------+-----------+-----------+------------+-----------+-----------+-----------+-----------+-----------+----------------+-------------+-------------+----------------+-------------------+----------------------+-------------------+---------------------+-----------+----------------+ +| ViT-L-14-336::openai | **0.709** | 0.781 | 0.837 | 0.744 | 0.556 | 0.783 | 0.937 | 0.940 | 0.560 | 0.615 | 0.638 | 0.608 | 0.733 | 0.200 | 0.158 | 0.032 | 0.024 | 0.046 | 0.113 | 0.158 | 0.262 | ++----------------------------------+------------+-----------+------------+-----------+-----------+------------+-----------+-----------+-----------+-----------+-----------+----------------+-------------+-------------+----------------+-------------------+----------------------+-------------------+---------------------+-----------+----------------+ +| ViT-H-14::laion2b_s32b_b79k | **0.709** | 0.777 | 0.850 | **0.847** | 0.678 | **0.801** | **0.945** | 0.961 | 0.563 | **0.726** | 0.699 | 0.542 | 0.297 | 0.268 | 0.169 | 0.032 | 0.027 | 0.054 | 0.111 | 0.140 | 0.110 | 
++----------------------------------+------------+-----------+------------+-----------+-----------+------------+-----------+-----------+-----------+-----------+-----------+----------------+-------------+-------------+----------------+-------------------+----------------------+-------------------+---------------------+-----------+----------------+ +| ViT-g-14::laion2b_s12b_b42k | 0.696 | **0.811** | **0.851** | 0.839 | **0.682** | 0.776 | 0.943 | **0.962** | **0.603** | 0.648 | 0.718 | 0.560 | 0.580 | **0.332** | 0.175 | 0.036 | 0.031 | 0.060 | 0.115 | 0.190 | 0.138 | ++----------------------------------+------------+-----------+------------+-----------+-----------+------------+-----------+-----------+-----------+-----------+-----------+----------------+-------------+-------------+----------------+-------------------+----------------------+-------------------+---------------------+-----------+----------------+ \ No newline at end of file From 8df3ed5dd4d0b52cf22bd0830e89dbe19745b075 Mon Sep 17 00:00:00 2001 From: ZiniuYu Date: Fri, 30 Sep 2022 23:05:15 +0800 Subject: [PATCH 22/34] docs: remove redundancy --- docs/user-guides/server.md | 68 ++++++++++++++++++++------------------ 1 file changed, 35 insertions(+), 33 deletions(-) diff --git a/docs/user-guides/server.md b/docs/user-guides/server.md index be05867fa..8043e282f 100644 --- a/docs/user-guides/server.md +++ b/docs/user-guides/server.md @@ -63,39 +63,41 @@ The procedure and UI of ONNX and TensorRT runtime would look the same as Pytorch The various `CLIP` models implemented in the [OpenAI](https://github.com/openai/CLIP), [OpenCLIP](https://github.com/mlfoundations/open_clip), and [MultilingualCLIP](https://github.com/FreddeFrallan/Multilingual-CLIP) are supported. `ViT-B-32::openai` is used as the default model in all runtimes. Due to the limitation of some runtimes, not every runtime supports all models. -Please also note that **different models give different sizes of output dimensions**. 
This will affect your downstream applications. For example, switching the model from one to another make your embedding incomparable, which breaks the downstream applications. Below is a list of supported models of each runtime and its corresponding size. We include the disk usage (in delta) and the peak RAM and VRAM usage (in delta) when running on a single Nvidia TITAN RTX GPU (24GB VRAM) for a series of text and image encoding tasks with `batch_size=8` using PyTorch runtime. - -| Model | PyTorch | ONNX | TensorRT | Output Dimension | Disk Usage (MB) | Peak RAM Usage (GB) | Peak VRAM Usage (GB) | -|---------------------------------------|---------|------|----------|------------------|-----------------|---------------------|----------------------| -| RN50::openai | ✅ | ✅ | ✅ | 1024 | 244 | 2.99 | 1.36 | -| RN50::yfcc15m | ✅ | ✅ | ✅ | 1024 | 389 | 2.86 | 1.36 | -| RN50::cc12m | ✅ | ✅ | ✅ | 1024 | 389 | 2.84 | 1.36 | -| RN101::openai | ✅ | ✅ | ✅ | 512 | 278 | 3.05 | 1.40 | -| RN101::yfcc15m | ✅ | ✅ | ✅ | 512 | 457 | 2.88 | 1.40 | -| RN50x4::openai | ✅ | ✅ | ✅ | 640 | 402 | 3.23 | 1.63 | -| RN50x16::openai | ✅ | ✅ | ❌ | 768 | 631 | 3.63 | 2.02 | -| RN50x64::openai | ✅ | ✅ | ❌ | 1024 | 1291 | 4.08 | 2.98 | -| ViT-B-32::openai | ✅ | ✅ | ✅ | 512 | 338 | 3.20 | 1.40 | -| ViT-B-32::laion2b_e16 | ✅ | ✅ | ✅ | 512 | 577 | 2.93 | 1.40 | -| ViT-B-32::laion400m_e31 | ✅ | ✅ | ✅ | 512 | 577 | 2.93 | 1.40 | -| ViT-B-32::laion400m_e32 | ✅ | ✅ | ✅ | 512 | 577 | 2.94 | 1.40 | -| ViT-B-32::laion2b-s34b-b79k | ✅ | ✅ | ❌ | 512 | 577 | 2.94 | 1.40 | -| ViT-B-16::openai | ✅ | ✅ | ✅ | 512 | 335 | 3.20 | 1.44 | -| ViT-B-16::laion400m_e31 | ✅ | ✅ | ✅ | 512 | 571 | 2.93 | 1.44 | -| ViT-B-16::laion400m_e32 | ✅ | ✅ | ✅ | 512 | 571 | 2.94 | 1.44 | -| ViT-B-16-plus-240::laion400m_e31 | ✅ | ✅ | 🚧 | 640 | 795 | 3.03 | 1.59 | -| ViT-B-16-plus-240::laion400m_e32 | ✅ | ✅ | 🚧 | 640 | 795 | 3.03 | 1.59 | -| ViT-L-14::openai | ✅ | ✅ | ❌ | 768 | 890 | 3.66 | 2.04 | -| ViT-L-14::laion400m_e31 | ✅ | ✅ | ❌ | 
768 | 1631 | 3.43 | 2.03 | -| ViT-L-14::laion400m_e32 | ✅ | ✅ | ❌ | 768 | 1631 | 3.42 | 2.03 | -| ViT-L-14::laion2b-s32b-b82k | ✅ | ✅ | ❌ | 768 | 1631 | 3.43 | 2.03 | -| ViT-L-14-336::openai | ✅ | ✅ | ❌ | 768 | 891 | 3.74 | 2.23 | -| ViT-H-14::laion2b-s32b-b79k | ✅ | ✅ | ❌ | 1024 | 3762 | 4.45 | 3.26 | -| ViT-g-14::laion2b-s12b-b42k | ✅ | ✅ | ❌ | 1024 | 5214 | 5.16 | 4.00 | -| M-CLIP/LABSE-Vit-L-14 | ✅ | ✅ | ❌ | 768 | 3609 | 4.30 | 4.70 | -| M-CLIP/XLM-Roberta-Large-Vit-B-32 | ✅ | ✅ | 🚧 | 512 | 4284 | 5.37 | 1.68 | -| M-CLIP/XLM-Roberta-Large-Vit-B-16Plus | ✅ | ✅ | 🚧 | 640 | 4293 | 4.30 | 4.13 | -| M-CLIP/XLM-Roberta-Large-Vit-L-14 | ✅ | ✅ | ❌ | 768 | 4293 | 4.30 | 4.97 | +Please also note that **different models give different sizes of output dimensions**. This will affect your downstream applications. For example, switching the model from one to another make your embedding incomparable, which breaks the downstream applications. Below is a list of supported models of each runtime and its corresponding size. + +For more details about the models and how to select the best model for your application, please refer to the [CLIP benchmark page](benchmark.rst). 
+ +| Model | PyTorch | ONNX | TensorRT | Output Dimension | +|---------------------------------------|---------|------|----------|------------------| +| RN50::openai | ✅ | ✅ | ✅ | 1024 | +| RN50::yfcc15m | ✅ | ✅ | ✅ | 1024 | +| RN50::cc12m | ✅ | ✅ | ✅ | 1024 | +| RN101::openai | ✅ | ✅ | ✅ | 512 | +| RN101::yfcc15m | ✅ | ✅ | ✅ | 512 | +| RN50x4::openai | ✅ | ✅ | ✅ | 640 | +| RN50x16::openai | ✅ | ✅ | ❌ | 768 | +| RN50x64::openai | ✅ | ✅ | ❌ | 1024 | +| ViT-B-32::openai | ✅ | ✅ | ✅ | 512 | +| ViT-B-32::laion2b_e16 | ✅ | ✅ | ✅ | 512 | +| ViT-B-32::laion400m_e31 | ✅ | ✅ | ✅ | 512 | +| ViT-B-32::laion400m_e32 | ✅ | ✅ | ✅ | 512 | +| ViT-B-32::laion2b-s34b-b79k | ✅ | ✅ | ❌ | 512 | +| ViT-B-16::openai | ✅ | ✅ | ✅ | 512 | +| ViT-B-16::laion400m_e31 | ✅ | ✅ | ✅ | 512 | +| ViT-B-16::laion400m_e32 | ✅ | ✅ | ✅ | 512 | +| ViT-B-16-plus-240::laion400m_e31 | ✅ | ✅ | 🚧 | 640 | +| ViT-B-16-plus-240::laion400m_e32 | ✅ | ✅ | 🚧 | 640 | +| ViT-L-14::openai | ✅ | ✅ | ❌ | 768 | +| ViT-L-14::laion400m_e31 | ✅ | ✅ | ❌ | 768 | +| ViT-L-14::laion400m_e32 | ✅ | ✅ | ❌ | 768 | +| ViT-L-14::laion2b-s32b-b82k | ✅ | ✅ | ❌ | 768 | +| ViT-L-14-336::openai | ✅ | ✅ | ❌ | 768 | +| ViT-H-14::laion2b-s32b-b79k | ✅ | ✅ | ❌ | 1024 | +| ViT-g-14::laion2b-s12b-b42k | ✅ | ✅ | ❌ | 1024 | +| M-CLIP/LABSE-Vit-L-14 | ✅ | ✅ | ❌ | 768 | +| M-CLIP/XLM-Roberta-Large-Vit-B-32 | ✅ | ✅ | 🚧 | 512 | +| M-CLIP/XLM-Roberta-Large-Vit-B-16Plus | ✅ | ✅ | 🚧 | 640 | +| M-CLIP/XLM-Roberta-Large-Vit-L-14 | ✅ | ✅ | ❌ | 768 | ✅ = Supported — 🚧 = Working in progress — ❌ = Not supported From 378a5005241f9fadb0c8506fec6e995f88b7b3ad Mon Sep 17 00:00:00 2001 From: ZiniuYu Date: Fri, 30 Sep 2022 23:20:32 +0800 Subject: [PATCH 23/34] docs: add links --- docs/user-guides/benchmark.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/user-guides/benchmark.rst b/docs/user-guides/benchmark.rst index f140cb5d0..5ea36b77a 100644 --- a/docs/user-guides/benchmark.rst +++ b/docs/user-guides/benchmark.rst @@ -9,7 +9,7 @@ 
They can be used as a guide to choose the best model for your application. Basic statistics ---------------- -We include the disk usage (in delta) and the peak RAM and VRAM usage (in delta) when running on a single Nvidia TITAN RTX GPU (24GB VRAM) for a series of text and image encoding tasks with :code:`batch_size=8` using PyTorch runtime. +In the table belowe, we include the disk usage (in delta) and the peak RAM and VRAM usage (in delta) when running on a single Nvidia TITAN RTX GPU (24GB VRAM) for a series of text and image encoding tasks with :code:`batch_size=8` using PyTorch runtime. +----------------------------------------+------------------+----------------------+-----------------------+ | Model | Disk Usage (MB) | Peak RAM Usage (GB) | Peak VRAM Usage (GB) | @@ -76,7 +76,7 @@ We include the disk usage (in delta) and the peak RAM and VRAM usage (in delta) Zero-shot retrieval ------------------- -In zero-shot retrieval benchmark, each model is evaluated on the following datasets: COCO Caption, Flickr8k and Flickr30k. +In zero-shot retrieval benchmark, each model is evaluated on the following datasets: `COCO Caption`, `Flickr8k` and `Flickr30k`. For the above datasets, there are five corresponding description sentences for each image written by humans. The results are reported in terms of top-5 text-to-image retrieval recall, top-5 image-to-text retrieval recall and their averages. More specifically, the top-5 text-to-image retrieval recall for each retrieved image is either 1 or 0. @@ -142,7 +142,7 @@ The top-5 image-to-text retrieval recall for each image is the number of top-5 r Zero-shot classification ------------------------ -In zero-shot classification benchmark, each model is evaluated on the following datasets: ImageNetV2, VOC2007 and 19 VTAB datasets. +In zero-shot classification benchmark, each model is evaluated on the following datasets: `ImageNetV2`, `VOC2007` and 19 `VTAB datasets`. The results are shown in the following table. 
For each dataset, we report the top-1 accuracy, which is whether the top-1 retrieved class of a image matches its true class. From 8c3ae0896d24e436e7b0f1f624a0bec54c109a28 Mon Sep 17 00:00:00 2001 From: ZiniuYu Date: Fri, 30 Sep 2022 23:28:17 +0800 Subject: [PATCH 24/34] fix: link --- docs/user-guides/benchmark.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/user-guides/benchmark.rst b/docs/user-guides/benchmark.rst index 5ea36b77a..2727420e9 100644 --- a/docs/user-guides/benchmark.rst +++ b/docs/user-guides/benchmark.rst @@ -76,7 +76,7 @@ In the table belowe, we include the disk usage (in delta) and the peak RAM and V Zero-shot retrieval ------------------- -In zero-shot retrieval benchmark, each model is evaluated on the following datasets: `COCO Caption`, `Flickr8k` and `Flickr30k`. +In zero-shot retrieval benchmark, each model is evaluated on the following datasets: `COCO Caption `_, `Flickr8k `_ and `Flickr30k `_. For the above datasets, there are five corresponding description sentences for each image written by humans. The results are reported in terms of top-5 text-to-image retrieval recall, top-5 image-to-text retrieval recall and their averages. More specifically, the top-5 text-to-image retrieval recall for each retrieved image is either 1 or 0. @@ -142,7 +142,7 @@ The top-5 image-to-text retrieval recall for each image is the number of top-5 r Zero-shot classification ------------------------ -In zero-shot classification benchmark, each model is evaluated on the following datasets: `ImageNetV2`, `VOC2007` and 19 `VTAB datasets`. +In zero-shot classification benchmark, each model is evaluated on the following datasets: `ImageNetV2 `_, `VOC2007 `_ and 19 `VTAB datasets `_. The results are shown in the following table. For each dataset, we report the top-1 accuracy, which is whether the top-1 retrieved class of a image matches its true class. 
From e232e321712d289475493096cd0e4576a5f7efc1 Mon Sep 17 00:00:00 2001 From: ZiniuYu Date: Fri, 30 Sep 2022 23:51:37 +0800 Subject: [PATCH 25/34] docs: update section --- docs/user-guides/benchmark.rst | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/docs/user-guides/benchmark.rst b/docs/user-guides/benchmark.rst index 2727420e9..2e0615eaf 100644 --- a/docs/user-guides/benchmark.rst +++ b/docs/user-guides/benchmark.rst @@ -9,7 +9,7 @@ They can be used as a guide to choose the best model for your application. Basic statistics ---------------- -In the table belowe, we include the disk usage (in delta) and the peak RAM and VRAM usage (in delta) when running on a single Nvidia TITAN RTX GPU (24GB VRAM) for a series of text and image encoding tasks with :code:`batch_size=8` using PyTorch runtime. +In the table below, we include the disk usage (in delta) and the peak RAM and VRAM usage (in delta) when running on a single Nvidia TITAN RTX GPU (24GB VRAM) for a series of text and image encoding tasks with :code:`batch_size=8` using PyTorch runtime. +----------------------------------------+------------------+----------------------+-----------------------+ | Model | Disk Usage (MB) | Peak RAM Usage (GB) | Peak VRAM Usage (GB) | @@ -73,8 +73,12 @@ In the table belowe, we include the disk usage (in delta) and the peak RAM and V | M-CLIP/XLM-Roberta-Large-Vit-L-14 | 4293 | 4.30 | 4.97 | +----------------------------------------+------------------+----------------------+-----------------------+ + +CLIP benchmark +-------------- + Zero-shot retrieval -------------------- ++++++++++++++++++++ In zero-shot retrieval benchmark, each model is evaluated on the following datasets: `COCO Caption `_, `Flickr8k `_ and `Flickr30k `_. For the above datasets, there are five corresponding description sentences for each image written by humans. 
@@ -140,7 +144,7 @@ The top-5 image-to-text retrieval recall for each image is the number of top-5 r +----------------------------------+---------------+---------------+-----------+---------------+---------------+-----------+---------------+---------------+-----------+ Zero-shot classification ------------------------- +++++++++++++++++++++++++ In zero-shot classification benchmark, each model is evaluated on the following datasets: `ImageNetV2 `_, `VOC2007 `_ and 19 `VTAB datasets `_. The results are shown in the following table. From 4b81a9f2f1d9bb1eef7943922fd30ae2bea9b09d Mon Sep 17 00:00:00 2001 From: ZiniuYu Date: Sat, 8 Oct 2022 17:27:32 +0800 Subject: [PATCH 26/34] docs: datasets description --- docs/user-guides/benchmark.rst | 46 ++++++++++++++++++++++++++++++++-- 1 file changed, 44 insertions(+), 2 deletions(-) diff --git a/docs/user-guides/benchmark.rst b/docs/user-guides/benchmark.rst index 2e0615eaf..462e8c2df 100644 --- a/docs/user-guides/benchmark.rst +++ b/docs/user-guides/benchmark.rst @@ -146,7 +146,7 @@ The top-5 image-to-text retrieval recall for each image is the number of top-5 r Zero-shot classification ++++++++++++++++++++++++ -In zero-shot classification benchmark, each model is evaluated on the following datasets: `ImageNetV2 `_, `VOC2007 `_ and 19 `VTAB datasets `_. +In zero-shot classification benchmark, each model is evaluated on the following datasets: `ImageNetV2 `_, `VOC2007 `_ and 19 `VTAB datasets `_. The results are shown in the following table. For each dataset, we report the top-1 accuracy, which is whether the top-1 retrieved class of a image matches its true class. 
@@ -204,4 +204,46 @@ For each dataset, we report the top-1 accuracy, which is whether the top-1 retri | ViT-H-14::laion2b_s32b_b79k | **0.709** | 0.777 | 0.850 | **0.847** | 0.678 | **0.801** | **0.945** | 0.961 | 0.563 | **0.726** | 0.699 | 0.542 | 0.297 | 0.268 | 0.169 | 0.032 | 0.027 | 0.054 | 0.111 | 0.140 | 0.110 | +----------------------------------+------------+-----------+------------+-----------+-----------+------------+-----------+-----------+-----------+-----------+-----------+----------------+-------------+-------------+----------------+-------------------+----------------------+-------------------+---------------------+-----------+----------------+ | ViT-g-14::laion2b_s12b_b42k | 0.696 | **0.811** | **0.851** | 0.839 | **0.682** | 0.776 | 0.943 | **0.962** | **0.603** | 0.648 | 0.718 | 0.560 | 0.580 | **0.332** | 0.175 | 0.036 | 0.031 | 0.060 | 0.115 | 0.190 | 0.138 | -+----------------------------------+------------+-----------+------------+-----------+-----------+------------+-----------+-----------+-----------+-----------+-----------+----------------+-------------+-------------+----------------+-------------------+----------------------+-------------------+---------------------+-----------+----------------+ \ No newline at end of file ++----------------------------------+------------+-----------+------------+-----------+-----------+------------+-----------+-----------+-----------+-----------+-----------+----------------+-------------+-------------+----------------+-------------------+----------------------+-------------------+---------------------+-----------+----------------+ + + +Appendix: Datasets description +------------------------------ + +* **Caltech101**: The task consists in classifying pictures of objects (101 classes plus a background clutter class), including animals, airplanes, chairs, or scissors. The image size varies, but it typically ranges from 200-300 pixels per edge. 
+ +* **CIFAR-100**: The task consists in classifying natural images (100 classes, with 500 training images each). Some examples include apples, bottles, dinosaurs, and bicycles. The image size is 32x32. + +* **DTD**: The task consists in classifying images of textural patterns (47 classes, with 120 training images each). Some of the textures are banded, bubbly, meshed, lined, or porous. The image size ranges between 300x300 and 640x640 pixels. + +* **Flowers102**: The task consists in classifying images of flowers present in the UK (102 classes, with between 40 and 248 training images per class). Azalea, Californian Poppy, Sunflower, or Petunia are some examples. Each image dimension has at least 500 pixels. + +* **Pets**: The task consists in classifying pictures of cat and dog breeds (37 classes with around 200 images each), including Persian cat, Chihuahua dog, English Setter dog, or Bengal cat. Images dimensions are typically 200 pixels or larger. + +* **Sun397**: The Sun397 task is a scenery benchmark with 397 classes and, at least, 100 images per class. Classes have a hierarchy structure, and include cathedral, staircase, shelter, river, or archipelago. The images are (colour) 200x200 pixels or larger. + +* **SVHN**: This task consists in classifying images of Google's street-view house numbers (10 classes, with more than 1000 training images each). The image size is 32x32 pixels. + +* **EuroSAT**: The task consists in classifying Sentinel-2 satellite images into 10 different types of land use (Residential, Industrial, River, Highway, etc). The spatial resolution corresponds to 10 meters per pixel, and the image size is 64x64 pixels. + +* **Resisc45**: The Remote Sensing Image Scene Classification (RESISC) dataset is a scene classification task from remote sensing images. There are 45 classes, containing 700 images each, including tennis court, ship, island, lake, parking lot, sparse residential, or stadium. The image size is RGB 256x256 pixels. 
+ +* **Patch Camelyon**: The Patch Camelyon dataset contains 327,680 images of histopathologic scans of lymph node sections. The classification task consists in predicting the presence of metastatic tissue in a given image (i.e., two classes). All images are 96x96 pixels. + +* **Retinopathy**: The Diabetic Retinopathy dataset consists of image-label pairs with high-resolution retina images, and labels that indicate the presence of Diabetic Retinopathy (DR) in a 0-4 scale (No DR, Mild, Moderate, Severe, or Proliferative DR). + +* **Clevr/count**: CLEVR is a visual question and answer dataset designed to evaluate algorithmic visual reasoning. We use just the images from this dataset, and create a synthetic task by setting the label equal to the number of objects in the images. + +* **Clevr/distance**: Another synthetic task we create from CLEVR consists of predicting the depth of the closest object in the image from the camera. The depths are bucketed into size bins. + +* **dSprites/location**: The dSprites dataset was originally designed to assess disentanglement properties of unsupervised learning algorithms. In particular, each image is a 2D shape where six factors are controlled: color, shape, scale, rotation, and (x,y) center coordinates. Images have 64x64 black-and-white pixels. This task consists in predicting the x (horizontal) coordinate of the object. The locations are bucketed into 16 bins. + +* **dSprites/orientation**: We create another task from dSprites that consists in predicting the orientation of each object, bucketed into 16 bins. + +* **SmallNORB/azimuth**: The Small NORB dataset contains images of 3D-toys from 50 classes, including animals, human figures, airplanes, trucks, and cars. The image size is 640x480 pixels. In this case, we define labels depending on the azimuth (angle of horizontal deviation), in intervals of 20 degrees (18 classes). 
+ +* **SmallNORB/elevation**: Another synthetic task we create from Small NORB consists in predicting the elevation in the image. There are 9 classes, corresponding to 9 different elevations ranging from 30 to 70 degrees, in intervals of 5 degrees. + +* **DMLab**: The DMLab (DeepMind Lab) is a set of control environments focused on 3D navigation and puzzle-solving tasks. The Dmlab dataset contains frames observed by the agent acting in the DeepMind Lab environment, which are annotated by the distance between the agent and various objects present in the environment. The goal is to evaluate the ability of a visual model to reason about distances from the visual input in 3D environments. The Dmlab dataset consists of 360x480 color images in 6 classes. The classes are {close, far, very far} x {positive reward, negative reward} respectively. + +* **KITTI-Dist**: The KITTI task consists in predicting the (binned) depth to the vehicle (car, van, or truck) in the image. There are 4 bins / classes. \ No newline at end of file From d07b5d7948198b0e33cf09ab70a37317b204ab04 Mon Sep 17 00:00:00 2001 From: ZiniuYu Date: Sat, 8 Oct 2022 22:37:13 +0800 Subject: [PATCH 27/34] docs: add datasets description --- docs/user-guides/benchmark.rst | 72 +++++++++++++++++++++++++--------- 1 file changed, 53 insertions(+), 19 deletions(-) diff --git a/docs/user-guides/benchmark.rst b/docs/user-guides/benchmark.rst index 462e8c2df..1e617575d 100644 --- a/docs/user-guides/benchmark.rst +++ b/docs/user-guides/benchmark.rst @@ -2,6 +2,7 @@ CLIP Benchmark ============== In order to evaluate the performance of different CLIP models, we conducted a benchmark on a series of tasks using different datasets. +We fork the `CLIP benchmark repo `_, slightly modify the codebase and apply it to all Vision Transformers (ViT) and ResNet (RN) CLIP models. You can find the benchmark results in the following tables. The best results are highlighted in bold. 
They can be used as a guide to choose the best model for your application. @@ -143,6 +144,10 @@ The top-5 image-to-text retrieval recall for each image is the number of top-5 r | ViT-g-14::laion2b_s12b_b42k | 0.724 | 0.853 | 0.788 | 0.730 | 0.846 | 0.788 | 0.639 | 0.806 | 0.722 | +----------------------------------+---------------+---------------+-----------+---------------+---------------+-----------+---------------+---------------+-----------+ +From the table, we observe that the ViT models outperform the RN models in general. +More specifically, the ViT-H-14::laion2b_s32b_b79k model and ViT-g-14::laion2b_s12b_b42k model achieve the best and second-best results on all zero-shot retrieval tasks. +For ViT models, the results of the same base model are better on those pre-trained with larger datasets (e.g., ViT-B-32::openai vs ViT-B-32::laion400m_e31 vs ViT-B-32::laion2b-s34b-b79k). + Zero-shot classification ++++++++++++++++++++++++ @@ -206,44 +211,73 @@ For each dataset, we report the top-1 accuracy, which is whether the top-1 retri | ViT-g-14::laion2b_s12b_b42k | 0.696 | **0.811** | **0.851** | 0.839 | **0.682** | 0.776 | 0.943 | **0.962** | **0.603** | 0.648 | 0.718 | 0.560 | 0.580 | **0.332** | 0.175 | 0.036 | 0.031 | 0.060 | 0.115 | 0.190 | 0.138 | +----------------------------------+------------+-----------+------------+-----------+-----------+------------+-----------+-----------+-----------+-----------+-----------+----------------+-------------+-------------+----------------+-------------------+----------------------+-------------------+---------------------+-----------+----------------+ +From the table, we observe that the ViT models still outperform the RN models in most tasks, except for the Patch Camelyon dataset where RN50::openai has the best top-1 accuracy of 0.636, and the KITTI/distance dataset where RN50::yfcc15m has the best result of 0.336. 
+Similar to retrieval results, the ViT-H-14::laion2b_s32b_b79k model and ViT-g-14::laion2b_s12b_b42k model still have the best or close to the best results on 12/21 zero-shot classification tasks. +All models tend to perform well on ImageNetV2, VOC2007, VTAB natural and VTAB specialized (except for Retinopathy) datasets, whereas they perform poorly on VTAB structured datasets. +We do not observe any significant difference between the ViT models of the same base model. + +Select the best model ++++++++++++++++++++++ + +In general, you can select the best model for your application from different perspectives: disk usage, peak RAM and VRAM usages, and most importantly, the performance. +Based on our experiments, we recommend the ViT models over the RN models for most general applications. +More specifically, the ViT-H-14::laion2b_s32b_b79k model and ViT-g-14::laion2b_s12b_b42k model should be first considered since they have the best or close to the best performance in most cases. +However, you should choose the model that best fits your requirements. +For example, if you are labelling images for Diabetic Retinopathy, you should probably select the ViT-B-32::laion2b_s34b_b79k model since it has the best top-1 accuracy of 0.734 on zero-shot classification of the Retinopathy dataset. Appendix: Datasets description ------------------------------ -* **Caltech101**: The task consists in classifying pictures of objects (101 classes plus a background clutter class), including animals, airplanes, chairs, or scissors. The image size varies, but it typically ranges from 200-300 pixels per edge. +* **COCO Caption**: The dataset contains over one and a half million captions describing over 330,000 images. For the training and validation images, five independent human generated captions are provided. + +* **Flickr 8k**: The dataset consists of 8,000 images that are each paired with five different captions which provide clear descriptions of the salient entities and events. 
The images were chosen from six different Flickr groups, and tend not to contain any well-known people or locations, but were manually selected to depict a variety of scenes and situations. + +* **Flickr 30k**: The dataset is an extension of the Flickr 8k Dataset. It consists of 158,915 crowd-sourced captions describing 31,783 images. + +* **ImageNetV2**: ImageNetV2 contains three test sets with 10,000 new images each. Importantly, these test sets were sampled after a decade of progress on the original ImageNet dataset. This makes the new test data independent of existing models and guarantees that the accuracy scores are not affected by adaptive overfitting. + +* **VOC2007**: The training data provided consists of a set of images; each image has an annotation file giving a bounding box and object class label for each object in one of the twenty classes present in the image. Note that multiple objects from multiple classes may be present in the same image. + +* **VTAB natural group**: The natural group represents classical vision problems. These tasks contain natural images captured using standard cameras. The classes may represent generic, fine-grained, or abstract objects. + + * **Caltech101**: The task consists in classifying pictures of objects (101 classes plus a background clutter class), including animals, airplanes, chairs, or scissors. The image size varies, but it typically ranges from 200-300 pixels per edge. + + * **CIFAR-100**: The task consists in classifying natural images (100 classes, with 500 training images each). Some examples include apples, bottles, dinosaurs, and bicycles. The image size is 32x32. + + * **DTD**: The task consists in classifying images of textural patterns (47 classes, with 120 training images each). Some of the textures are banded, bubbly, meshed, lined, or porous. The image size ranges between 300x300 and 640x640 pixels. 
-* **CIFAR-100**: The task consists in classifying natural images (100 classes, with 500 training images each). Some examples include apples, bottles, dinosaurs, and bicycles. The image size is 32x32. + * **Flowers102**: The task consists in classifying images of flowers present in the UK (102 classes, with between 40 and 248 training images per class). Azalea, Californian Poppy, Sunflower, or Petunia are some examples. Each image dimension has at least 500 pixels. -* **DTD**: The task consists in classifying images of textural patterns (47 classes, with 120 training images each). Some of the textures are banded, bubbly, meshed, lined, or porous. The image size ranges between 300x300 and 640x640 pixels. + * **Pets**: The task consists in classifying pictures of cat and dog breeds (37 classes with around 200 images each), including Persian cat, Chihuahua dog, English Setter dog, or Bengal cat. Images dimensions are typically 200 pixels or larger. -* **Flowers102**: The task consists in classifying images of flowers present in the UK (102 classes, with between 40 and 248 training images per class). Azalea, Californian Poppy, Sunflower, or Petunia are some examples. Each image dimension has at least 500 pixels. + * **Sun397**: The Sun397 task is a scenery benchmark with 397 classes and, at least, 100 images per class. Classes have a hierarchy structure, and include cathedral, staircase, shelter, river, or archipelago. The images are (colour) 200x200 pixels or larger. -* **Pets**: The task consists in classifying pictures of cat and dog breeds (37 classes with around 200 images each), including Persian cat, Chihuahua dog, English Setter dog, or Bengal cat. Images dimensions are typically 200 pixels or larger. + * **SVHN**: This task consists in classifying images of Google's street-view house numbers (10 classes, with more than 1000 training images each). The image size is 32x32 pixels. 
-* **Sun397**: The Sun397 task is a scenery benchmark with 397 classes and, at least, 100 images per class. Classes have a hierarchy structure, and include cathedral, staircase, shelter, river, or archipelago. The images are (colour) 200x200 pixels or larger. +* **VTAB specialized group**: The specialized group also contains images of the world, but captured through specialist equipment. These images have different invariances to those in the natural tasks. Nonetheless, humans recognize the structures therein, thus generic visual representations should also capture the visual concepts. It has two sub-groups: remote sensing and medical. -* **SVHN**: This task consists in classifying images of Google's street-view house numbers (10 classes, with more than 1000 training images each). The image size is 32x32 pixels. + * **EuroSAT**: The task consists in classifying Sentinel-2 satellite images into 10 different types of land use (Residential, Industrial, River, Highway, etc). The spatial resolution corresponds to 10 meters per pixel, and the image size is 64x64 pixels. -* **EuroSAT**: The task consists in classifying Sentinel-2 satellite images into 10 different types of land use (Residential, Industrial, River, Highway, etc). The spatial resolution corresponds to 10 meters per pixel, and the image size is 64x64 pixels. + * **Resisc45**: The Remote Sensing Image Scene Classification (RESISC) dataset is a scene classification task from remote sensing images. There are 45 classes, containing 700 images each, including tennis court, ship, island, lake, parking lot, sparse residential, or stadium. The image size is RGB 256x256 pixels. -* **Resisc45**: The Remote Sensing Image Scene Classification (RESISC) dataset is a scene classification task from remote sensing images. There are 45 classes, containing 700 images each, including tennis court, ship, island, lake, parking lot, sparse residential, or stadium. The image size is RGB 256x256 pixels. 
+ * **Patch Camelyon**: The Patch Camelyon dataset contains 327,680 images of histopathologic scans of lymph node sections. The classification task consists in predicting the presence of metastatic tissue in a given image (i.e., two classes). All images are 96x96 pixels. -* **Patch Camelyon**: The Patch Camelyon dataset contains 327,680 images of histopathologic scans of lymph node sections. The classification task consists in predicting the presence of metastatic tissue in given image (i.e., two classes). All images are 96x96 pixels. + * **Retinopathy**: The Diabetic Retinopathy dataset consists of image-label pairs with high-resolution retina images, and labels that indicate the presence of Diabetic Retinopathy (DR) in a 0-4 scale (No DR, Mild, Moderate, Severe, or Proliferative DR). -* **Retinopathy**: The Diabetic Retinopathy dataset consists of image-label pairs with high-resolution retina images, and labels that indicate the presence of Diabetic Retinopahy (DR) in a 0-4 scale (No DR, Mild, Moderate, Severe, or Proliferative DR). +* **VTAB structured group**: The structured group assesses comprehension of the structure of a scene, for example, object counting, or 3D depth prediction. Most of these tasks are generated from simulated environments, whose structure is easy for a human to determine, but whose domain differs greatly to datasets like ImageNet. These tasks are intended as a step towards useful representations for perceptual control. -* **Clevr/count**: CLEVR is a visual question and answer dataset designed to evaluate algorithmic visual reasoning. We use just the images from this dataset, and create a synthetic task by setting the label equal to the number of objects in the images. + * **Clevr/count**: CLEVR is a visual question and answer dataset designed to evaluate algorithmic visual reasoning. We use just the images from this dataset, and create a synthetic task by setting the label equal to the number of objects in the images. 
-* **Clevr/distance**: Another synthetic task we create from CLEVR consists of predicting the depth of the closest object in the image from the camera. The depths are bucketed into size bins. + * **Clevr/distance**: Another synthetic task we create from CLEVR consists of predicting the depth of the closest object in the image from the camera. The depths are bucketed into size bins. -* **dSprites/location**: The dSprites dataset was originally designed to asses disentanglement properties of unsupervised learning algorithms. In particular, each image is a 2D shape where six factors are controlled: color, shape, scale, rotation, and (x,y) center coordinates. Images have 64x64 black-and-white pixels. This task consists in predicting the x (horizontal) coordinate of the object. The locations are bucketed into 16 bins. + * **dSprites/location**: The dSprites dataset was originally designed to assess disentanglement properties of unsupervised learning algorithms. In particular, each image is a 2D shape where six factors are controlled: color, shape, scale, rotation, and (x,y) center coordinates. Images have 64x64 black-and-white pixels. This task consists in predicting the x (horizontal) coordinate of the object. The locations are bucketed into 16 bins. -* **dSprites/orientation**: We create another task from dSprites consists in predicting the orientation of each object, bucketed into 16 bins. + * **dSprites/orientation**: We create another task from dSprites that consists in predicting the orientation of each object, bucketed into 16 bins. -* **SmallNORB/azimuth**: The Small NORB dataset contains images of 3D-toys from 50 classes, including animals, human figures, airplanes, trucks, and cars. The image size is 640x480 pixels. In this case, we define labels depending on the azimuth (angle of horizontal deviation), in intervals of 20 degrees (18 classes). 
+ * **SmallNORB/azimuth**: The Small NORB dataset contains images of 3D-toys from 50 classes, including animals, human figures, airplanes, trucks, and cars. The image size is 640x480 pixels. In this case, we define labels depending on the azimuth (angle of horizontal deviation), in intervals of 20 degrees (18 classes). -* **SmallNORB/elevation**: Another synthetic task we create from Small NORB consists in predicting the elevation in the image. There are 9 classes, corresponding to 9 different elevations ranging from 30 to 70 degrees, in intervals of 5 degrees. + * **SmallNORB/elevation**: Another synthetic task we create from Small NORB consists in predicting the elevation in the image. There are 9 classes, corresponding to 9 different elevations ranging from 30 to 70 degrees, in intervals of 5 degrees. -* **DMLab**: The DMLab (DeepMind Lab) is a set of control environments focused on 3D navigation and puzzle-solving tasks. The Dmlab dataset contains frames observed by the agent acting in the DeepMind Lab environment, which are annotated by the distance between the agent and various objects present in the environment. The goal is to evaluate the ability of a visual model to reason about distances from the visual input in 3D environments. The Dmlab dataset consists of 360x480 color images in 6 classes. The classes are {close, far, very far} x {positive reward, negative reward} respectively. + * **DMLab**: The DMLab (DeepMind Lab) is a set of control environments focused on 3D navigation and puzzle-solving tasks. The Dmlab dataset contains frames observed by the agent acting in the DeepMind Lab environment, which are annotated by the distance between the agent and various objects present in the environment. The goal is to evaluate the ability of a visual model to reason about distances from the visual input in 3D environments. The Dmlab dataset consists of 360x480 color images in 6 classes. 
The classes are {close, far, very far} x {positive reward, negative reward} respectively. -* **KITTI-Dist**: The KITTI task consists in predicting the (binned) depth to the vehicle (car, van, or truck) in the image. There are 4 bins / classes. \ No newline at end of file + * **KITTI-Dist**: The KITTI task consists in predicting the (binned) depth to the vehicle (car, van, or truck) in the image. There are 4 bins / classes. From 67605dea49d5f4febca80458915927a4e5fb9cc6 Mon Sep 17 00:00:00 2001 From: ZiniuYu Date: Sat, 8 Oct 2022 22:43:23 +0800 Subject: [PATCH 28/34] docs: format --- docs/user-guides/benchmark.rst | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/user-guides/benchmark.rst b/docs/user-guides/benchmark.rst index 1e617575d..98114a779 100644 --- a/docs/user-guides/benchmark.rst +++ b/docs/user-guides/benchmark.rst @@ -145,8 +145,8 @@ The top-5 image-to-text retrieval recall for each image is the number of top-5 r +----------------------------------+---------------+---------------+-----------+---------------+---------------+-----------+---------------+---------------+-----------+ From the table, we observe that the ViT models outperform the RN models in general. -More specifically, the ViT-H-14::laion2b_s32b_b79k model and ViT-g-14::laion2b_s12b_b42k model achieve the best and second-best results on all zero-shot retrieval tasks. -For ViT models, the results of the same base model are better on those pre-trained with larger datasets (e.g., ViT-B-32::openai vs ViT-B-32::laion400m_e31 vs ViT-B-32::laion2b-s34b-b79k). +More specifically, the ``ViT-H-14::laion2b_s32b_b79k`` model and ``ViT-g-14::laion2b_s12b_b42k`` model achieve the best and second-best results on all zero-shot retrieval tasks. +For ViT models, the results of the same base model are better on those pre-trained with larger datasets (e.g., ``ViT-B-32::openai`` vs ``ViT-B-32::laion400m_e31`` vs ``ViT-B-32::laion2b-s34b-b79k``). 
Zero-shot classification ++++++++++++++++++++++++ @@ -211,8 +211,8 @@ For each dataset, we report the top-1 accuracy, which is whether the top-1 retri | ViT-g-14::laion2b_s12b_b42k | 0.696 | **0.811** | **0.851** | 0.839 | **0.682** | 0.776 | 0.943 | **0.962** | **0.603** | 0.648 | 0.718 | 0.560 | 0.580 | **0.332** | 0.175 | 0.036 | 0.031 | 0.060 | 0.115 | 0.190 | 0.138 | +----------------------------------+------------+-----------+------------+-----------+-----------+------------+-----------+-----------+-----------+-----------+-----------+----------------+-------------+-------------+----------------+-------------------+----------------------+-------------------+---------------------+-----------+----------------+ -From the table, we observe that the ViT models still outperform the RN models in most tasks, except for the Patch Camelyon dataset where RN50::openai has the best top-1 accuracy of 0.636, and the KITTI/distance dataset where RN50::yfcc15m has the best result of 0.336. -Similar to retrieval results, the ViT-H-14::laion2b_s32b_b79k model and ViT-g-14::laion2b_s12b_b42k model still have the best or close to the best results on 12/21 zero-shot classification tasks. +From the table, we observe that the ViT models still outperform the RN models in most tasks, except for the Patch Camelyon dataset where ``RN50::openai`` has the best top-1 accuracy of 0.636, and the KITTI/distance dataset where ``RN50::yfcc15m`` has the best result of 0.336. +Similar to retrieval results, the ``ViT-H-14::laion2b_s32b_b79k`` model and ``ViT-g-14::laion2b_s12b_b42k`` model still have the best or close to the best results on 12/21 zero-shot classification tasks. All models tend to perform well on ImageNetV2, VOC2007, VTAB natural and VTAB specialized (except for Retinopathy) datasets, whereas they perform poorly on VTAB structured datasets. We do not observe any significant difference between the ViT models of the same base model. 
@@ -221,9 +221,9 @@ Select the best model In general, you can select the best model for your application from different perspectives: disk usage, peak RAM and VRAM usages, and most importantly, the performance. Based on our experiments, we recommend the ViT models over the RN models for most general applications. -More specifically, the ViT-H-14::laion2b_s32b_b79k model and ViT-g-14::laion2b_s12b_b42k model should be first considered since they have the best or close to the best performance in most cases. +More specifically, the ``ViT-H-14::laion2b_s32b_b79k`` model and ``ViT-g-14::laion2b_s12b_b42k`` model should be first considered since they have the best or close to the best performance in most cases. However, you should choose the model that best fits your requirements. -For example, if you are labelling images for Diabetic Retinopathy, you should probably select the ViT-B-32::laion2b_s34b_b79k model since it has the best top-1 accuracy of 0.734 on zero-shot classification of the Retinopathy dataset. +For example, if you are labelling images for Diabetic Retinopathy, you should probably select the ``ViT-B-32::laion2b_s34b_b79k`` model since it has the best top-1 accuracy of 0.734 on zero-shot classification of the Retinopathy dataset. 
Appendix: Datasets description ------------------------------ From ccaaec1ea5d962fd09ffdfd124901b78e105b268 Mon Sep 17 00:00:00 2001 From: ZiniuYu Date: Sat, 8 Oct 2022 23:01:05 +0800 Subject: [PATCH 29/34] docs: footnote --- docs/user-guides/benchmark.rst | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/docs/user-guides/benchmark.rst b/docs/user-guides/benchmark.rst index 98114a779..8c39fa225 100644 --- a/docs/user-guides/benchmark.rst +++ b/docs/user-guides/benchmark.rst @@ -228,17 +228,17 @@ For example, if you are labelling images for Diabetic Retinopathy, you should pr Appendix: Datasets description ------------------------------ -* **COCO Caption**: The dataset contains over one and a half million captions describing over 330,000 images. For the training and validation images, five independent human generated captions are provided. +* **COCO Caption** [1]_: The dataset contains over one and a half million captions describing over 330,000 images. For the training and validation images, five independent human generated captions are provided. -* **Flickr 8k**: The dataset consists of 8,000 images that are each paired with five different captions which provide clear descriptions of the salient entities and events. The images were chosen from six different Flickr groups, and tend not to contain any well-known people or locations, but were manually selected to depict a variety of scenes and situations. +* **Flickr 8k** [2]_: The dataset consists of 8,000 images that are each paired with five different captions which provide clear descriptions of the salient entities and events. The images were chosen from six different Flickr groups, and tend not to contain any well-known people or locations, but were manually selected to depict a variety of scenes and situations. -* **Flickr 30k**: The dataset is an extension of the Flickr 8k Dataset. It consists of 158,915 crowd-sourced captions describing 31,783 images. 
+* **Flickr 30k** [3]_: The dataset is an extension of the Flickr 8k Dataset. It consists of 158,915 crowd-sourced captions describing 31,783 images. -* **ImageNetV2**: ImageNetV2 contains three test sets with 10,000 new images each. Importantly, these test sets were sampled after a decade of progress on the original ImageNet dataset. This makes the new test data independent of existing models and guarantees that the accuracy scores are not affected by adaptive overfitting. +* **ImageNetV2** [4]_: ImageNetV2 contains three test sets with 10,000 new images each. Importantly, these test sets were sampled after a decade of progress on the original ImageNet dataset. This makes the new test data independent of existing models and guarantees that the accuracy scores are not affected by adaptive overfitting. -* **VOC2007**: The training data provided consists of a set of images; each image has an annotation file giving a bounding box and object class label for each object in one of the twenty classes present in the image. Note that multiple objects from multiple classes may be present in the same image. +* **VOC2007** [5]_: The training data provided consists of a set of images; each image has an annotation file giving a bounding box and object class label for each object in one of the twenty classes present in the image. Note that multiple objects from multiple classes may be present in the same image. -* **VTAB natural group**: The natural group represents classical vision problems. These tasks contain natural images captured using standard cameras. The classes may represent generic, fine-grained, or abstract objects. +* **VTAB natural group** [6]_: The natural group represents classical vision problems. These tasks contain natural images captured using standard cameras. The classes may represent generic, fine-grained, or abstract objects. 
* **Caltech101**: The task consists in classifying pictures of objects (101 classes plus a background clutter class), including animals, airplanes, chairs, or scissors. The image size varies, but it typically ranges from 200-300 pixels per edge. @@ -281,3 +281,10 @@ Appendix: Datasets description * **DMLab**: The DMLab (DeepMind Lab) is a set of control environments focused on 3D navigation and puzzle-solving tasks. The Dmlab dataset contains frames observed by the agent acting in the DeepMind Lab environment, which are annotated by the distance between the agent and various objects present in the environment. The goal is to evaluate the ability of a visual model to reason about distances from the visual input in 3D environments. The Dmlab dataset consists of 360x480 color images in 6 classes. The classes are {close, far, very far} x {positive reward, negative reward} respectively. * **KITTI-Dist**: The KITTI task consists in predicting the (binned) depth to the vehicle (car, van, or truck) in the image. There are 4 bins / classes. + +.. [1] https://arxiv.org/pdf/1504.00325.pdf +.. [2] https://www.kaggle.com/datasets/adityajn105/flickr8k +.. [3] https://shannon.cs.illinois.edu/DenotationGraph/ +.. [4] https://github.com/modestyachts/ImageNetV2 +.. [5] http://host.robots.ox.ac.uk/pascal/VOC/voc2007/ +.. [6] https://arxiv.org/pdf/1910.04867.pdf From 8e398b6e73e6d2f6db22ea48bd7b056a4ea5c9f5 Mon Sep 17 00:00:00 2001 From: ZiniuYu Date: Sun, 9 Oct 2022 14:28:26 +0800 Subject: [PATCH 30/34] docs: add QPS --- docs/user-guides/benchmark.rst | 131 +++++++++++++++++---------------- 1 file changed, 67 insertions(+), 64 deletions(-) diff --git a/docs/user-guides/benchmark.rst b/docs/user-guides/benchmark.rst index 8c39fa225..9c2e95638 100644 --- a/docs/user-guides/benchmark.rst +++ b/docs/user-guides/benchmark.rst @@ -10,69 +10,71 @@ They can be used as a guide to choose the best model for your application. 
Basic statistics ---------------- -In the table below, we include the disk usage (in delta) and the peak RAM and VRAM usage (in delta) when running on a single Nvidia TITAN RTX GPU (24GB VRAM) for a series of text and image encoding tasks with :code:`batch_size=8` using PyTorch runtime. - -+----------------------------------------+------------------+----------------------+-----------------------+ -| Model | Disk Usage (MB) | Peak RAM Usage (GB) | Peak VRAM Usage (GB) | -+========================================+==================+======================+=======================+ -| RN50::openai | **244** | 2.99 | **1.36** | -+----------------------------------------+------------------+----------------------+-----------------------+ -| RN50::yfcc15m | 389 | 2.86 | **1.36** | -+----------------------------------------+------------------+----------------------+-----------------------+ -| RN50::cc12m | 389 | **2.84** | **1.36** | -+----------------------------------------+------------------+----------------------+-----------------------+ -| RN101::openai | 278 | 3.05 | 1.40 | -+----------------------------------------+------------------+----------------------+-----------------------+ -| RN101::yfcc15m | 457 | 2.88 | 1.40 | -+----------------------------------------+------------------+----------------------+-----------------------+ -| RN50x4::openai | 402 | 3.23 | 1.63 | -+----------------------------------------+------------------+----------------------+-----------------------+ -| RN50x16::openai | 631 | 3.63 | 2.02 | -+----------------------------------------+------------------+----------------------+-----------------------+ -| RN50x64::openai | 1291 | 4.08 | 2.98 | -+----------------------------------------+------------------+----------------------+-----------------------+ -| ViT-B-32::openai | 338 | 3.20 | 1.40 | -+----------------------------------------+------------------+----------------------+-----------------------+ -| ViT-B-32::laion2b_e16 | 577 | 2.93 | 1.40 | 
-+----------------------------------------+------------------+----------------------+-----------------------+ -| ViT-B-32::laion400m_e31 | 577 | 2.93 | 1.40 | -+----------------------------------------+------------------+----------------------+-----------------------+ -| ViT-B-32::laion400m_e32 | 577 | 2.94 | 1.40 | -+----------------------------------------+------------------+----------------------+-----------------------+ -| ViT-B-32::laion2b-s34b-b79k | 577 | 2.94 | 1.40 | -+----------------------------------------+------------------+----------------------+-----------------------+ -| ViT-B-16::openai | 335 | 3.20 | 1.44 | -+----------------------------------------+------------------+----------------------+-----------------------+ -| ViT-B-16::laion400m_e31 | 571 | 2.93 | 1.44 | -+----------------------------------------+------------------+----------------------+-----------------------+ -| ViT-B-16::laion400m_e32 | 571 | 2.94 | 1.44 | -+----------------------------------------+------------------+----------------------+-----------------------+ -| ViT-B-16-plus-240::laion400m_e31 | 795 | 3.03 | 1.59 | -+----------------------------------------+------------------+----------------------+-----------------------+ -| ViT-B-16-plus-240::laion400m_e32 | 795 | 3.03 | 1.59 | -+----------------------------------------+------------------+----------------------+-----------------------+ -| ViT-L-14::openai | 890 | 3.66 | 2.04 | -+----------------------------------------+------------------+----------------------+-----------------------+ -| ViT-L-14::laion400m_e31 | 1631 | 3.43 | 2.03 | -+----------------------------------------+------------------+----------------------+-----------------------+ -| ViT-L-14::laion400m_e32 | 1631 | 3.42 | 2.03 | -+----------------------------------------+------------------+----------------------+-----------------------+ -| ViT-L-14::laion2b-s32b-b82k | 1631 | 3.43 | 2.03 | 
-+----------------------------------------+------------------+----------------------+-----------------------+ -| ViT-L-14-336::openai | 891 | 3.74 | 2.23 | -+----------------------------------------+------------------+----------------------+-----------------------+ -| ViT-H-14::laion2b-s32b-b79k | 3762 | 4.45 | 3.26 | -+----------------------------------------+------------------+----------------------+-----------------------+ -| ViT-g-14::laion2b-s12b-b42k | 5214 | 5.16 | 4.00 | -+----------------------------------------+------------------+----------------------+-----------------------+ -| M-CLIP/LABSE-Vit-L-14 | 3609 | 4.30 | 4.70 | -+----------------------------------------+------------------+----------------------+-----------------------+ -| M-CLIP/XLM-Roberta-Large-Vit-B-32 | 4284 | 5.37 | 1.68 | -+----------------------------------------+------------------+----------------------+-----------------------+ -| M-CLIP/XLM-Roberta-Large-Vit-B-16Plus | 4293 | 4.30 | 4.13 | -+----------------------------------------+------------------+----------------------+-----------------------+ -| M-CLIP/XLM-Roberta-Large-Vit-L-14 | 4293 | 4.30 | 4.97 | -+----------------------------------------+------------------+----------------------+-----------------------+ +In the table below, we include the disk usage (in delta) and the peak RAM and VRAM usage (in delta) when running on a single Nvidia TITAN RTX GPU (24GB VRAM) for a series of text and image encoding tasks with ``batch_size=8`` using PyTorch runtime. +We also include the QPS (Queries Per Second) for the text and image encoding tasks using ``clip_client`` with PyTorch runtime. 
+ ++----------------------------------------+------------------+----------------------+-----------------------+-----------+------------+ +| Model | Disk Usage (MB) | Peak RAM Usage (GB) | Peak VRAM Usage (GB) | Text QPS | Image QPS | ++========================================+==================+======================+=======================+===========+============+ +| RN50::openai | **244** | 2.99 | **1.36** | 1019 | 269 | ++----------------------------------------+------------------+----------------------+-----------------------+-----------+------------+ +| RN50::yfcc15m | 389 | 2.86 | **1.36** | 1083 | 262 | ++----------------------------------------+------------------+----------------------+-----------------------+-----------+------------+ +| RN50::cc12m | 389 | **2.84** | **1.36** | 1064 | 264 | ++----------------------------------------+------------------+----------------------+-----------------------+-----------+------------+ +| RN101::openai | 278 | 3.05 | 1.40 | 1047 | 222 | ++----------------------------------------+------------------+----------------------+-----------------------+-----------+------------+ +| RN101::yfcc15m | 457 | 2.88 | 1.40 | 1107 | 223 | ++----------------------------------------+------------------+----------------------+-----------------------+-----------+------------+ +| RN50x4::openai | 402 | 3.23 | 1.63 | 1047 | 218 | ++----------------------------------------+------------------+----------------------+-----------------------+-----------+------------+ +| RN50x16::openai | 631 | 3.63 | 2.02 | 1038 | 121 | ++----------------------------------------+------------------+----------------------+-----------------------+-----------+------------+ +| RN50x64::openai | 1291 | 4.08 | 2.98 | 985 | 59 | ++----------------------------------------+------------------+----------------------+-----------------------+-----------+------------+ +| ViT-B-32::openai | 338 | 3.20 | 1.40 | 1064 | 286 | 
++----------------------------------------+------------------+----------------------+-----------------------+-----------+------------+ +| ViT-B-32::laion2b_e16 | 577 | 2.93 | 1.40 | **1120** | **292** | ++----------------------------------------+------------------+----------------------+-----------------------+-----------+------------+ +| ViT-B-32::laion400m_e31 | 577 | 2.93 | 1.40 | 1080 | 287 | ++----------------------------------------+------------------+----------------------+-----------------------+-----------+------------+ +| ViT-B-32::laion400m_e32 | 577 | 2.94 | 1.40 | 1092 | 289 | ++----------------------------------------+------------------+----------------------+-----------------------+-----------+------------+ +| ViT-B-32::laion2b-s34b-b79k | 577 | 2.94 | 1.40 | 1102 | 285 | ++----------------------------------------+------------------+----------------------+-----------------------+-----------+------------+ +| ViT-B-16::openai | 335 | 3.20 | 1.44 | 1064 | 260 | ++----------------------------------------+------------------+----------------------+-----------------------+-----------+------------+ +| ViT-B-16::laion400m_e31 | 571 | 2.93 | 1.44 | 1099 | 262 | ++----------------------------------------+------------------+----------------------+-----------------------+-----------+------------+ +| ViT-B-16::laion400m_e32 | 571 | 2.94 | 1.44 | 1082 | 268 | ++----------------------------------------+------------------+----------------------+-----------------------+-----------+------------+ +| ViT-B-16-plus-240::laion400m_e31 | 795 | 3.03 | 1.59 | 1059 | 235 | ++----------------------------------------+------------------+----------------------+-----------------------+-----------+------------+ +| ViT-B-16-plus-240::laion400m_e32 | 795 | 3.03 | 1.59 | 1043 | 239 | ++----------------------------------------+------------------+----------------------+-----------------------+-----------+------------+ +| ViT-L-14::openai | 890 | 3.66 | 2.04 | 1040 | 140 | 
++----------------------------------------+------------------+----------------------+-----------------------+-----------+------------+ +| ViT-L-14::laion400m_e31 | 1631 | 3.43 | 2.03 | 1058 | 147 | ++----------------------------------------+------------------+----------------------+-----------------------+-----------+------------+ +| ViT-L-14::laion400m_e32 | 1631 | 3.42 | 2.03 | 1061 | 146 | ++----------------------------------------+------------------+----------------------+-----------------------+-----------+------------+ +| ViT-L-14::laion2b-s32b-b82k | 1631 | 3.43 | 2.03 | 1069 | 147 | ++----------------------------------------+------------------+----------------------+-----------------------+-----------+------------+ +| ViT-L-14-336::openai | 891 | 3.74 | 2.23 | 1070 | 76 | ++----------------------------------------+------------------+----------------------+-----------------------+-----------+------------+ +| ViT-H-14::laion2b-s32b-b79k | 3762 | 4.45 | 3.26 | 642 | 91 | ++----------------------------------------+------------------+----------------------+-----------------------+-----------+------------+ +| ViT-g-14::laion2b-s12b-b42k | 5214 | 5.16 | 4.00 | 639 | 69 | ++----------------------------------------+------------------+----------------------+-----------------------+-----------+------------+ +| M-CLIP/LABSE-Vit-L-14 | 3609 | 4.30 | 4.70 | 646 | 284 | ++----------------------------------------+------------------+----------------------+-----------------------+-----------+------------+ +| M-CLIP/XLM-Roberta-Large-Vit-B-32 | 4284 | 5.37 | 1.68 | 656 | 139 | ++----------------------------------------+------------------+----------------------+-----------------------+-----------+------------+ +| M-CLIP/XLM-Roberta-Large-Vit-B-16Plus | 4293 | 4.30 | 4.13 | 662 | 236 | ++----------------------------------------+------------------+----------------------+-----------------------+-----------+------------+ +| M-CLIP/XLM-Roberta-Large-Vit-L-14 | 4293 | 4.30 | 4.97 | 
1027 | 139 | ++----------------------------------------+------------------+----------------------+-----------------------+-----------+------------+ + CLIP benchmark @@ -219,7 +221,8 @@ We do not observe any significant difference between the ViT models of the same Select the best model +++++++++++++++++++++ -In general, you can select the best model for your application from different perspectives: disk usage, peak RAM and VRAM usages, and most importantly, the performance. +In general, you can select the best model for your application from different perspectives: disk usage, peak RAM and VRAM usages, QPS, and most importantly, the performance. + Based on our experiments, we recommend the ViT models over the RN models for most general applications. More specifically, the ``ViT-H-14::laion2b_s32b_b79k`` model and ``ViT-g-14::laion2b_s12b_b42k`` model should be first considered since they have the best or close to the best performance in most cases. However, you should choose the model that best fits your requirements. From c1f06e53675af645352d15d318fd3b4b40d8e0e9 Mon Sep 17 00:00:00 2001 From: ZiniuYu Date: Sun, 9 Oct 2022 14:36:04 +0800 Subject: [PATCH 31/34] docs: improve conclusion --- docs/user-guides/benchmark.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/user-guides/benchmark.rst b/docs/user-guides/benchmark.rst index 9c2e95638..bc01257f4 100644 --- a/docs/user-guides/benchmark.rst +++ b/docs/user-guides/benchmark.rst @@ -225,7 +225,8 @@ In general, you can select the best model for your application from different pe Based on our experiments, we recommend the ViT models over the RN models for most general applications. More specifically, the ``ViT-H-14::laion2b_s32b_b79k`` model and ``ViT-g-14::laion2b_s12b_b42k`` model should be first considered since they have the best or close to the best performance in most cases. -However, you should choose the model that best fits your requirements. 
+However, if you are concerned about the encoding speed, you can consider other ViT models because they have higher QPS with decent performance. +Anyway, you should choose the model that best fits your requirements. For example, if you are labelling images for Diabetic Retinopathy, you should probably select the ``ViT-B-32::laion2b_s34b_b79k`` model since it has the best top-1 accuracy of 0.734 on zero-shot classification of the Retinopathy dataset. Appendix: Datasets description From 16689a3a0ea5a3205f8c21c57ef296ac5d0254a2 Mon Sep 17 00:00:00 2001 From: ZiniuYu Date: Sun, 9 Oct 2022 22:57:05 +0800 Subject: [PATCH 32/34] docs: update machine config --- docs/user-guides/benchmark.rst | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/docs/user-guides/benchmark.rst b/docs/user-guides/benchmark.rst index bc01257f4..a41fd858a 100644 --- a/docs/user-guides/benchmark.rst +++ b/docs/user-guides/benchmark.rst @@ -10,7 +10,7 @@ They can be used as a guide to choose the best model for your application. Basic statistics ---------------- -In the table below, we include the disk usage (in delta) and the peak RAM and VRAM usage (in delta) when running on a single Nvidia TITAN RTX GPU (24GB VRAM) for a series of text and image encoding tasks with ``batch_size=8`` using PyTorch runtime. +In the table below, we include the disk usage (in delta) and the peak RAM and VRAM usage (in delta) when running on a single Nvidia TITAN RTX GPU (24GB VRAM) with a Intel® Core™ i7-10700K Processor (128GB RAM) for a series of text and image encoding tasks with ``batch_size=8`` using PyTorch runtime. We also include the QPS (Queries Per Second) for the text and image encoding tasks using ``clip_client`` with PyTorch runtime. 
+----------------------------------------+------------------+----------------------+-----------------------+-----------+------------+ @@ -227,7 +227,8 @@ Based on our experiments, we recommend the ViT models over the RN models for mos More specifically, the ``ViT-H-14::laion2b_s32b_b79k`` model and ``ViT-g-14::laion2b_s12b_b42k`` model should be first considered since they have the best or close to the best performance in most cases. However, if you are concerned about the encoding speed, you can consider other ViT models because they have higher QPS with decent performance. Anyway, you should choose the model that best fits your requirements. -For example, if you are labelling images for Diabetic Retinopathy, you should probably select the ``ViT-B-32::laion2b_s34b_b79k`` model since it has the best top-1 accuracy of 0.734 on zero-shot classification of the Retinopathy dataset. +For example, if you are labeling images for diabetic retinopathy, you should probably select the ``ViT-B-32::laion2b_s34b_b79k`` model since it has the best top-1 accuracy of 0.734 on zero-shot classification of the Retinopathy dataset. +Or if you are dealing with histopathologic images, you should probably select the RN50::openai model since it has the best top-1 accuracy of 0.636 on zero-shot classification of the Patch Camelyon dataset. 
Appendix: Datasets description ------------------------------ From cc0e98c1bb132d88694d6aaee36bf4b225b373c7 Mon Sep 17 00:00:00 2001 From: ZiniuYu Date: Sun, 9 Oct 2022 23:02:47 +0800 Subject: [PATCH 33/34] docs: update software version --- docs/user-guides/benchmark.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/user-guides/benchmark.rst b/docs/user-guides/benchmark.rst index a41fd858a..60bae37ff 100644 --- a/docs/user-guides/benchmark.rst +++ b/docs/user-guides/benchmark.rst @@ -11,6 +11,7 @@ Basic statistics ---------------- In the table below, we include the disk usage (in delta) and the peak RAM and VRAM usage (in delta) when running on a single Nvidia TITAN RTX GPU (24GB VRAM) with a Intel® Core™ i7-10700K Processor (128GB RAM) for a series of text and image encoding tasks with ``batch_size=8`` using PyTorch runtime. +We use ``clip_client==3.7.0``, ``clip_server==3.7.0``, ``jina==3.10.1`` and ``docarry==0.17.0``. We also include the QPS (Queries Per Second) for the text and image encoding tasks using ``clip_client`` with PyTorch runtime. +----------------------------------------+------------------+----------------------+-----------------------+-----------+------------+ From 983945125aace907242dea8d05e630ac1dc6967f Mon Sep 17 00:00:00 2001 From: felix-wang <35718120+numb3r3@users.noreply.github.com> Date: Mon, 10 Oct 2022 14:06:23 +0800 Subject: [PATCH 34/34] chore: polish benchmark doc (#839) * chore: update benchmark intro * chore: minor revision * chore: minor revision * chore: minor revision * chore: minor revision * chore: minor revision * chore: minor revision --- docs/index.md | 2 +- docs/user-guides/benchmark.rst | 43 ++++++++++++++++++---------------- 2 files changed, 24 insertions(+), 21 deletions(-) diff --git a/docs/index.md b/docs/index.md index f243fc404..9415971a5 100644 --- a/docs/index.md +++ b/docs/index.md @@ -177,8 +177,8 @@ It means the client and the server are now connected. Well done! 
user-guides/client user-guides/server -user-guides/retriever user-guides/benchmark +user-guides/retriever user-guides/faq ``` diff --git a/docs/user-guides/benchmark.rst b/docs/user-guides/benchmark.rst index 60bae37ff..ee1e58383 100644 --- a/docs/user-guides/benchmark.rst +++ b/docs/user-guides/benchmark.rst @@ -1,27 +1,27 @@ -CLIP Benchmark -============== +Benchmark +========= -In order to evaluate the performance of different CLIP models, we conducted a benchmark on a series of tasks using different datasets. -We fork the `CLIP benchmark repo `_, slightly modify the codebase and apply it to all Vision Transformers (ViT) and ResNet (RN) CLIP models. -You can find the benchmark results in the following tables. -The best results are highlighted in bold. -They can be used as a guide to choose the best model for your application. +In order to understand the zero-shot performance of CLIP and its limitations, we conducted a benchmark +across a variety of computer vision datasets (the dataset details are in the appendix). Here, thanks for the +open-source `CLIP Benchmark toolkit `_, we can easily reproduce the results. -Basic statistics ----------------- +We hope that this benchmark can help you to better understand the performance of CLIP models and choose the best model for your application. -In the table below, we include the disk usage (in delta) and the peak RAM and VRAM usage (in delta) when running on a single Nvidia TITAN RTX GPU (24GB VRAM) with a Intel® Core™ i7-10700K Processor (128GB RAM) for a series of text and image encoding tasks with ``batch_size=8`` using PyTorch runtime. -We use ``clip_client==3.7.0``, ``clip_server==3.7.0``, ``jina==3.10.1`` and ``docarry==0.17.0``. -We also include the QPS (Queries Per Second) for the text and image encoding tasks using ``clip_client`` with PyTorch runtime. 
+ +Size and efficiency +------------------------- + +We first present the model's size and efficiency in terms of query time and memory usage (including the peak RAM and VRAM usage). +All of the results are obtained on a single Nvidia TITAN RTX GPU (24GB VRAM) with default server settings. +----------------------------------------+------------------+----------------------+-----------------------+-----------+------------+ | Model | Disk Usage (MB) | Peak RAM Usage (GB) | Peak VRAM Usage (GB) | Text QPS | Image QPS | +========================================+==================+======================+=======================+===========+============+ -| RN50::openai | **244** | 2.99 | **1.36** | 1019 | 269 | +| RN50::openai | 244 | 2.99 | 1.36 | 1019 | 269 | +----------------------------------------+------------------+----------------------+-----------------------+-----------+------------+ -| RN50::yfcc15m | 389 | 2.86 | **1.36** | 1083 | 262 | +| RN50::yfcc15m | 389 | 2.86 | 1.36 | 1083 | 262 | +----------------------------------------+------------------+----------------------+-----------------------+-----------+------------+ -| RN50::cc12m | 389 | **2.84** | **1.36** | 1064 | 264 | +| RN50::cc12m | 389 | 2.84 | 1.36 | 1064 | 264 | +----------------------------------------+------------------+----------------------+-----------------------+-----------+------------+ | RN101::openai | 278 | 3.05 | 1.40 | 1047 | 222 | +----------------------------------------+------------------+----------------------+-----------------------+-----------+------------+ @@ -35,7 +35,7 @@ We also include the QPS (Queries Per Second) for the text and image encoding tas +----------------------------------------+------------------+----------------------+-----------------------+-----------+------------+ | ViT-B-32::openai | 338 | 3.20 | 1.40 | 1064 | 286 | +----------------------------------------+------------------+----------------------+-----------------------+-----------+------------+ -| 
ViT-B-32::laion2b_e16 | 577 | 2.93 | 1.40 | **1120** | **292** | +| ViT-B-32::laion2b_e16 | 577 | 2.93 | 1.40 | 1120 | 292 | +----------------------------------------+------------------+----------------------+-----------------------+-----------+------------+ | ViT-B-32::laion400m_e31 | 577 | 2.93 | 1.40 | 1080 | 287 | +----------------------------------------+------------------+----------------------+-----------------------+-----------+------------+ @@ -78,8 +78,11 @@ We also include the QPS (Queries Per Second) for the text and image encoding tas -CLIP benchmark --------------- +Zero-shot performance +---------------------------- + +In this section, we will report the zero-shot performance of the models on classification and retrieval tasks across different datasets. +In the following tables, we will highlight the best results in bold for each dataset (higher is better). Zero-shot retrieval +++++++++++++++++++ @@ -219,8 +222,8 @@ Similar to retrieval results, the ``ViT-H-14::laion2b_s32b_b79k`` model and ``Vi All models tend to perform well on ImageNetV2, VOC2007, VTAB natural and VTAB specialized (except for Retinopathy) datasets, whereas they perform poorly on VTAB structured datasets. We do not observe any significant difference between the ViT models of the same base model. -Select the best model -+++++++++++++++++++++ +Select the right model +----------------------- In general, you can select the best model for your application from different perspectives: disk usage, peak RAM and VRAM usages, QPS, and most importantly, the performance.