From ffea5fc6c8d1ff32db289663709e550ed15e7ecf Mon Sep 17 00:00:00 2001 From: Desroziers Date: Sun, 21 Mar 2021 18:46:39 +0100 Subject: [PATCH 01/13] add bleu metric - refactor rouge and add nlp module --- ignite/metrics/nlp/__init__.py | 0 ignite/metrics/nlp/bleu.py | 186 +++++++++++++++ ignite/metrics/{ => nlp}/rouge.py | 0 ignite/metrics/nlp/utils.py | 0 tests/ignite/metrics/nlp/__init__.py | 0 tests/ignite/metrics/nlp/test_bleu.py | 239 +++++++++++++++++++ tests/ignite/metrics/{ => nlp}/test_rouge.py | 2 +- tests/ignite/metrics/nlp/test_utils.py | 0 8 files changed, 426 insertions(+), 1 deletion(-) create mode 100644 ignite/metrics/nlp/__init__.py create mode 100644 ignite/metrics/nlp/bleu.py rename ignite/metrics/{ => nlp}/rouge.py (100%) create mode 100644 ignite/metrics/nlp/utils.py create mode 100644 tests/ignite/metrics/nlp/__init__.py create mode 100644 tests/ignite/metrics/nlp/test_bleu.py rename tests/ignite/metrics/{ => nlp}/test_rouge.py (99%) create mode 100644 tests/ignite/metrics/nlp/test_utils.py diff --git a/ignite/metrics/nlp/__init__.py b/ignite/metrics/nlp/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/ignite/metrics/nlp/bleu.py b/ignite/metrics/nlp/bleu.py new file mode 100644 index 00000000000..d36e68bda65 --- /dev/null +++ b/ignite/metrics/nlp/bleu.py @@ -0,0 +1,186 @@ +import math +from collections import Counter +from typing import Callable, Sequence, Optional, Tuple, Union, Any + +import torch + +from ignite.exceptions import NotComputableError +from ignite.metrics.metric import Metric, reinit__is_reduced, sync_all_reduce + +from ignite.metrics.nlp.utils import modified_precision + +__all__ = ["Bleu"] + + +def _closest_ref_length(references: Sequence[Sequence[Any]], hyp_len: int) -> int: + ref_lens = (len(reference) for reference in references) + closest_ref_len = min(ref_lens, key=lambda ref_len: (abs(ref_len - hyp_len), ref_len)) + return closest_ref_len + + +class _Smoother: + """ + Smoothing helper + http://acl2014.org/acl2014/W14-33/pdf/W14-3346.pdf + """ + def __init__(self, method: str): + valid = ["no_smooth", "smooth1", "nltk_smooth2", "smooth2"] + if method not in valid: + raise ValueError(f"Smooth is not valid (expected: {valid}, got: {method})") + self.smooth = method + + def __call__(self, numerators: Counter, denominators: Counter) -> Sequence[float]: + method = getattr(self, self.smooth) + return method(numerators, denominators) + + @staticmethod + def smooth1(numerators: Counter, denominators: Counter) -> Sequence[float]: + epsilon = 0.1 + denominators = [max(1, d) for d in denominators.values()] + return [n / d if n != 0 else epsilon / d for n, d in zip(numerators.values(), denominators)] + + @staticmethod + def nltk_smooth2(numerators: Counter, denominators: Counter) -> Sequence[float]: + denominators = [max(1, d) for d in denominators.values()] + return [(n + 1) / (d + 1) for n, d in zip(numerators.values(), denominators)] + + @staticmethod + def smooth2(numerators: Counter, denominators: Counter) -> Sequence[float]: + return [(n + 1) / (d + 1) for n, d in zip(numerators.values(), denominators.values())] + + @staticmethod + def no_smooth(numerators: Counter, denominators: Counter) -> Sequence[float]: + denominators = [max(1, d) for d in denominators.values()] + return [n / d for n, d in zip(numerators.values(), denominators)] + + +class Bleu(Metric): + r"""Calculates the `BLEU score `_. + + .. 
math:: + \text{BLEU} = \text{BP} \dot exp \left( \sum_{n=1}^{N} w_{n} log p_{n} \right) + + where :math:`N` is the order of n-grams, :math:`\text{BP}` is a sentence brevety penalty, :math:`w_{n}` are + positive weights summing to one and :math:`p_{n}` are modified n-gram precisions. + + More details can be found in `Papineni et al. 2002`__. + + __ https://www.aclweb.org/anthology/P02-1040.pdf + + In addition, a review of smoothing techniques can be found in `Chen et al. 2014`__ + + __ http://acl2014.org/acl2014/W14-33/pdf/W14-3346.pdf + + Remark : + + This implementation is inspired by nltk + + Args: + ngram: order of n-grams. + smooth: enable smoothing. Valid are "no_smooth", "smooth1", "nltk_smooth2" or "smooth2". (Default: "no_smooth") + output_transform: a callable that is used to transform the + :class:`~ignite.engine.engine.Engine`'s ``process_function``'s output into the + form expected by the metric. This can be useful if, for example, you have a multi-output model and + you want to compute the metric with respect to one of the outputs. + By default, metrics require the output as ``(y_pred, y)`` or ``{'y_pred': y_pred, 'y': y}``. + device: specifies which device updates are accumulated on. Setting the + metric's device to be the same as your ``update`` arguments ensures the ``update`` method is + non-blocking. By default, CPU. + + Example: + + .. code-block:: python + + from ignite.metrics import Bleu + m = Bleu(ngram=4, smooth="smooth1") + y_pred = "the the the the the the the" + y = ["the cat is on the mat", "there is a cat on the mat"] + m.update((y_pred.split(), [y.split()])) + print(m.compute()) + + .. versionadded:: 0.5.0 + """ + + def __init__( + self, + ngram: int = 4, + smooth: str = "no_smooth", + output_transform: Callable = lambda x: x, + device: Union[str, torch.device] = torch.device("cpu"), + ): + if ngram <= 0: + raise ValueError(f"ngram order must be greater than zero (got: {ngram})") + self.ngrams_order = ngram + self.weights = [1 / self.ngrams_order] * self.ngrams_order + self.smoother = _Smoother(method=smooth) + super(Bleu, self).__init__(output_transform=output_transform, device=device) + + def corpus_bleu( + self, references: Sequence[Sequence[Any]], candidates: Sequence[Sequence[Any]], + ): + p_numerators = Counter() # Key = ngram order, and value = no. of ngram matches. + p_denominators = Counter() # Key = ngram order, and value = no. of ngram in ref. + + assert len(references) == len(candidates), ( + "The number of hypotheses and their reference(s) should be the same " + ) + + # Iterate through each hypothesis and their corresponding references. + for refs, hyp in zip(references, candidates): + # For each order of ngram, calculate the numerator and + # denominator for the corpus-level modified precision. + for i in range(1, self.ngrams_order + 1): + numerator, denominator = modified_precision(refs, hyp, i) + p_numerators[i] += numerator + p_denominators[i] += denominator + + # Returns 0 if there's no matching n-grams + # We only need to check for p_numerators[1] == 0, since if there's + # no unigrams, there won't be any higher order ngrams. + if p_numerators[1] == 0: + return 0 + + # If no smoother, returns 0 if there's at least one a not matching n-grams + if self.smoother.smooth == "no_smooth" and min(p_numerators.values()) == 0: + return 0 + + # Calculate the hypothesis lengths + hyp_lengths = [len(hyp) for hyp in candidates] + + # Calculate the closest reference lengths. 
+ ref_lengths = [_closest_ref_length(refs, hyp_len) for refs, hyp_len in zip(references, hyp_lengths)] + + # Sum of hypothesis and references lengths + hyp_len = sum(hyp_lengths) + ref_len = sum(ref_lengths) + + # Calculate corpus-level brevity penalty. + if hyp_len < ref_len: + bp = math.exp(1 - ref_len / hyp_len) if hyp_len > 0 else 0.0 + else: + bp = 1.0 + + # Smoothing + p_n = self.smoother(p_numerators, p_denominators) + + # Compute the geometric mean + s = [w_i * math.log(p_i) for w_i, p_i in zip(self.weights, p_n)] + s = bp * math.exp(math.fsum(s)) + return s + + @reinit__is_reduced + def reset(self) -> None: + self._sum_of_bleu = torch.tensor(0.0, dtype=torch.double, device=self._device) + self._num_sentences = 0 + + @reinit__is_reduced + def update(self, output: Tuple[Sequence[Any], Sequence[Sequence[Any]]]) -> None: + y_pred, y = output + self._sum_of_bleu += self.corpus_bleu(references=[y], candidates=[y_pred]) + self._num_sentences += 1 + + @sync_all_reduce("_sum_of_bleu", "_num_sentences") + def compute(self) -> torch.Tensor: + if self._num_sentences == 0: + raise NotComputableError("Bleu must have at least one example before it can be computed.") + return self._sum_of_bleu / self._num_sentences diff --git a/ignite/metrics/rouge.py b/ignite/metrics/nlp/rouge.py similarity index 100% rename from ignite/metrics/rouge.py rename to ignite/metrics/nlp/rouge.py diff --git a/ignite/metrics/nlp/utils.py b/ignite/metrics/nlp/utils.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/ignite/metrics/nlp/__init__.py b/tests/ignite/metrics/nlp/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/ignite/metrics/nlp/test_bleu.py b/tests/ignite/metrics/nlp/test_bleu.py new file mode 100644 index 00000000000..21aee7fad3e --- /dev/null +++ b/tests/ignite/metrics/nlp/test_bleu.py @@ -0,0 +1,239 @@ +import os +import pytest +import warnings + +from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction + +import torch + +import ignite.distributed as idist +from ignite.exceptions import NotComputableError +from ignite.metrics.nlp import Bleu + +from . 
import CorpusForTest + +corpus = CorpusForTest(lower_split=True) + + +def test_wrong_inputs(): + + with pytest.raises(ValueError, match=r"ngram order must be greater than zero"): + Bleu(ngram=0) + + with pytest.raises(ValueError, match=r"Smooth is not valid"): + Bleu(smooth="fake") + + with pytest.raises(NotComputableError): + Bleu().compute() + + +@pytest.mark.parametrize( + "candidate, references", + [ + ([["a", "a", "a", "b", "c"]], [[["a", "b", "c"], ["a", "a", "d"]]]), + corpus.sample_1, + corpus.sample_2, + corpus.sample_3, + corpus.sample_4, + ], +) +def test_corpus_bleu(candidate, references): + print(candidate, references) + for i in range(1, 8): + weights = tuple([1 / i] * i) + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + reference = corpus_bleu(references, candidate, weights=weights) + bleu = Bleu(ngram=i) + assert pytest.approx(reference) == bleu.corpus_bleu(references, candidate) + bleu.update((candidate[0], references[0])) + assert pytest.approx(reference) == bleu.compute() + + +@pytest.mark.parametrize( + "candidate, references", + [ + ([["a", "a", "a", "b", "c"]], [[["a", "b", "c"], ["a", "a", "d"]]]), + corpus.sample_1, + corpus.sample_2, + corpus.sample_3, + corpus.sample_4, + ], +) +def test_corpus_bleu_smooth1(candidate, references): + for i in range(1, 8): + weights = tuple([1 / i] * i) + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + reference = corpus_bleu(references, candidate, + weights=weights, + smoothing_function=SmoothingFunction().method1) + bleu = Bleu(ngram=i, smooth="smooth1") + assert reference == bleu.corpus_bleu(references, candidate) + bleu.update((candidate[0], references[0])) + assert reference == bleu.compute() + + +@pytest.mark.parametrize( + "candidate, references", + [ + ([["a", "a", "a", "b", "c"]], [[["a", "b", "c"], ["a", "a", "d"]]]), + corpus.sample_1, + corpus.sample_2, + corpus.sample_3, + corpus.sample_4, + ], +) +def test_corpus_bleu_nltk_smooth2(candidate, references): + for i in range(1, 8): + weights = tuple([1 / i] * i) + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + reference = corpus_bleu(references, candidate, + weights=weights, + smoothing_function=SmoothingFunction().method2) + bleu = Bleu(ngram=i, smooth="nltk_smooth2") + assert reference == bleu.corpus_bleu(references, candidate) + bleu.update((candidate[0], references[0])) + assert reference == bleu.compute() + + +@pytest.mark.parametrize( + "candidate, references", + [ + ([["a", "a", "a", "b", "c"]], [[["a", "b", "c"], ["a", "a", "d"]]]), + corpus.sample_1, + corpus.sample_2, + corpus.sample_3, + corpus.sample_4, + ], +) +def test_corpus_bleu_smooth2(candidate, references): + for i in range(1, 3): + weights = tuple([1 / i] * i) + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + reference = corpus_bleu(references, candidate, + weights=weights, + smoothing_function=SmoothingFunction().method2) + bleu = Bleu(ngram=i, smooth="smooth2") + assert reference == bleu.corpus_bleu(references, candidate) + bleu.update((candidate[0], references[0])) + assert reference == bleu.compute() + + +def test_bleu(): + bleu = Bleu(ngram=4, smooth="smooth2") + bleu.update((corpus.cand_1, corpus.references_1)) + bleu.update((corpus.cand_2a, corpus.references_2)) + bleu.update((corpus.cand_2b, corpus.references_2)) + bleu.update((corpus.cand_3, corpus.references_2)) + value = bleu.corpus_bleu([corpus.references_1], [corpus.cand_1]) + value += bleu.corpus_bleu([corpus.references_2], [corpus.cand_2a]) + value += 
bleu.corpus_bleu([corpus.references_2], [corpus.cand_2b]) + value += bleu.corpus_bleu([corpus.references_2], [corpus.cand_3]) + assert bleu.compute() == value / 4 + + +def _test_distrib_integration(device): + + from ignite.engine import Engine + + rank = idist.get_rank() + + size = len(corpus.chunks) + + data = [] + for c in corpus.chunks: + data += idist.get_world_size() * [c] + + def update(_, i): + return data[i + size * rank] + + def _test(metric_device): + engine = Engine(update) + m = Bleu(ngram=4, smooth="smooth2") + m.attach(engine, "bleu") + + engine.run(data=list(range(size)), max_epochs=1) + + assert "bleu" in engine.state.metrics + + ref_bleu = 0 + for candidate, references in data: + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + ref_bleu += corpus_bleu([references], [candidate], + weights=[0.25, 0.25, 0.25, 0.25], + smoothing_function=SmoothingFunction().method2) + + assert pytest.approx(engine.state.metrics["bleu"]) == ref_bleu / len(data) + + _test("cpu") + + if device.type != "xla": + _test(idist.device()) + + +@pytest.mark.distributed +@pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") +@pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") +def test_distrib_gpu(local_rank, distributed_context_single_node_nccl): + device = torch.device(f"cuda:{local_rank}") + _test_distrib_integration(device) + + +@pytest.mark.distributed +@pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") +def test_distrib_cpu(distributed_context_single_node_gloo): + device = torch.device("cpu") + _test_distrib_integration(device) + + +@pytest.mark.distributed +@pytest.mark.skipif(not idist.has_hvd_support, reason="Skip if no Horovod dist support") +@pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Skip if launched as multiproc") +def test_distrib_hvd(gloo_hvd_executor): + + device = torch.device("cpu" if not torch.cuda.is_available() else "cuda") + nproc = 4 if not torch.cuda.is_available() else torch.cuda.device_count() + + gloo_hvd_executor(_test_distrib_integration, (device,), np=nproc, do_init=True) + + +@pytest.mark.multinode_distributed +@pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") +@pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") +def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): + device = torch.device("cpu") + _test_distrib_integration(device) + + +@pytest.mark.multinode_distributed +@pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") +@pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") +def test_multinode_distrib_gpu(distributed_context_multi_node_nccl): + device = torch.device(f"cuda:{distributed_context_multi_node_nccl['local_rank']}") + _test_distrib_integration(device) + + +@pytest.mark.tpu +@pytest.mark.skipif("NUM_TPU_WORKERS" in os.environ, reason="Skip if NUM_TPU_WORKERS is in env vars") +@pytest.mark.skipif(not idist.has_xla_support, reason="Skip if no PyTorch XLA package") +def test_distrib_single_device_xla(): + device = idist.device() + _test_distrib_integration(device) + + +def _test_distrib_xla_nprocs(index): + device = idist.device() + _test_distrib_integration(device) + + +@pytest.mark.tpu +@pytest.mark.skipif("NUM_TPU_WORKERS" not in os.environ, reason="Skip if no NUM_TPU_WORKERS in env vars") +@pytest.mark.skipif(not 
idist.has_xla_support, reason="Skip if no PyTorch XLA package") +def test_distrib_xla_nprocs(xmp_executor): + n = int(os.environ["NUM_TPU_WORKERS"]) + xmp_executor(_test_distrib_xla_nprocs, args=(), nprocs=n) + diff --git a/tests/ignite/metrics/test_rouge.py b/tests/ignite/metrics/nlp/test_rouge.py similarity index 99% rename from tests/ignite/metrics/test_rouge.py rename to tests/ignite/metrics/nlp/test_rouge.py index b8c5bc6e6f3..b83c97afaf1 100644 --- a/tests/ignite/metrics/test_rouge.py +++ b/tests/ignite/metrics/nlp/test_rouge.py @@ -8,7 +8,7 @@ import ignite.distributed as idist from ignite.exceptions import NotComputableError from ignite.metrics import Rouge -from ignite.metrics.rouge import RougeL, RougeN, compute_ngram_scores, lcs, ngrams +from ignite.metrics.nlp.rouge import RougeL, RougeN, compute_ngram_scores, lcs, ngrams nltk.download("punkt") diff --git a/tests/ignite/metrics/nlp/test_utils.py b/tests/ignite/metrics/nlp/test_utils.py new file mode 100644 index 00000000000..e69de29bb2d From fd0df398e588b202ef128fd8592348ddfc8f91e0 Mon Sep 17 00:00:00 2001 From: Sylvain Desroziers Date: Sun, 21 Mar 2021 20:06:24 +0100 Subject: [PATCH 02/13] Remove blank line --- tests/ignite/metrics/nlp/test_bleu.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/ignite/metrics/nlp/test_bleu.py b/tests/ignite/metrics/nlp/test_bleu.py index 21aee7fad3e..36f998b419f 100644 --- a/tests/ignite/metrics/nlp/test_bleu.py +++ b/tests/ignite/metrics/nlp/test_bleu.py @@ -236,4 +236,3 @@ def _test_distrib_xla_nprocs(index): def test_distrib_xla_nprocs(xmp_executor): n = int(os.environ["NUM_TPU_WORKERS"]) xmp_executor(_test_distrib_xla_nprocs, args=(), nprocs=n) - From 30c667ad161e6e7fd73d5a1022a75df4bdbe43e3 Mon Sep 17 00:00:00 2001 From: sdesrozis Date: Sun, 21 Mar 2021 19:07:14 +0000 Subject: [PATCH 03/13] autopep8 fix --- ignite/metrics/nlp/bleu.py | 8 +++---- tests/ignite/metrics/nlp/test_bleu.py | 32 ++++++++++++++------------- 2 files changed, 20 insertions(+), 20 deletions(-) diff --git a/ignite/metrics/nlp/bleu.py b/ignite/metrics/nlp/bleu.py index d36e68bda65..b0706886778 100644 --- a/ignite/metrics/nlp/bleu.py +++ b/ignite/metrics/nlp/bleu.py @@ -1,12 +1,11 @@ import math from collections import Counter -from typing import Callable, Sequence, Optional, Tuple, Union, Any +from typing import Any, Callable, Optional, Sequence, Tuple, Union import torch from ignite.exceptions import NotComputableError from ignite.metrics.metric import Metric, reinit__is_reduced, sync_all_reduce - from ignite.metrics.nlp.utils import modified_precision __all__ = ["Bleu"] @@ -23,6 +22,7 @@ class _Smoother: Smoothing helper http://acl2014.org/acl2014/W14-33/pdf/W14-3346.pdf """ + def __init__(self, method: str): valid = ["no_smooth", "smooth1", "nltk_smooth2", "smooth2"] if method not in valid: @@ -121,9 +121,7 @@ def corpus_bleu( p_numerators = Counter() # Key = ngram order, and value = no. of ngram matches. p_denominators = Counter() # Key = ngram order, and value = no. of ngram in ref. - assert len(references) == len(candidates), ( - "The number of hypotheses and their reference(s) should be the same " - ) + assert len(references) == len(candidates), "The number of hypotheses and their reference(s) should be the same " # Iterate through each hypothesis and their corresponding references. 
for refs, hyp in zip(references, candidates): diff --git a/tests/ignite/metrics/nlp/test_bleu.py b/tests/ignite/metrics/nlp/test_bleu.py index 36f998b419f..57171d60bbe 100644 --- a/tests/ignite/metrics/nlp/test_bleu.py +++ b/tests/ignite/metrics/nlp/test_bleu.py @@ -1,10 +1,9 @@ import os -import pytest import warnings -from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction - +import pytest import torch +from nltk.translate.bleu_score import SmoothingFunction, corpus_bleu import ignite.distributed as idist from ignite.exceptions import NotComputableError @@ -65,9 +64,9 @@ def test_corpus_bleu_smooth1(candidate, references): weights = tuple([1 / i] * i) with warnings.catch_warnings(): warnings.simplefilter("ignore") - reference = corpus_bleu(references, candidate, - weights=weights, - smoothing_function=SmoothingFunction().method1) + reference = corpus_bleu( + references, candidate, weights=weights, smoothing_function=SmoothingFunction().method1 + ) bleu = Bleu(ngram=i, smooth="smooth1") assert reference == bleu.corpus_bleu(references, candidate) bleu.update((candidate[0], references[0])) @@ -89,9 +88,9 @@ def test_corpus_bleu_nltk_smooth2(candidate, references): weights = tuple([1 / i] * i) with warnings.catch_warnings(): warnings.simplefilter("ignore") - reference = corpus_bleu(references, candidate, - weights=weights, - smoothing_function=SmoothingFunction().method2) + reference = corpus_bleu( + references, candidate, weights=weights, smoothing_function=SmoothingFunction().method2 + ) bleu = Bleu(ngram=i, smooth="nltk_smooth2") assert reference == bleu.corpus_bleu(references, candidate) bleu.update((candidate[0], references[0])) @@ -113,9 +112,9 @@ def test_corpus_bleu_smooth2(candidate, references): weights = tuple([1 / i] * i) with warnings.catch_warnings(): warnings.simplefilter("ignore") - reference = corpus_bleu(references, candidate, - weights=weights, - smoothing_function=SmoothingFunction().method2) + reference = corpus_bleu( + references, candidate, weights=weights, smoothing_function=SmoothingFunction().method2 + ) bleu = Bleu(ngram=i, smooth="smooth2") assert reference == bleu.corpus_bleu(references, candidate) bleu.update((candidate[0], references[0])) @@ -163,9 +162,12 @@ def _test(metric_device): for candidate, references in data: with warnings.catch_warnings(): warnings.simplefilter("ignore") - ref_bleu += corpus_bleu([references], [candidate], - weights=[0.25, 0.25, 0.25, 0.25], - smoothing_function=SmoothingFunction().method2) + ref_bleu += corpus_bleu( + [references], + [candidate], + weights=[0.25, 0.25, 0.25, 0.25], + smoothing_function=SmoothingFunction().method2, + ) assert pytest.approx(engine.state.metrics["bleu"]) == ref_bleu / len(data) From c9a0eef9d3a6ebe5f3bd698f9b9e7bdc4940037e Mon Sep 17 00:00:00 2001 From: Sylvain Desroziers Date: Sun, 21 Mar 2021 20:22:18 +0100 Subject: [PATCH 04/13] Remove metrics --- ignite/metrics/__init__.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/ignite/metrics/__init__.py b/ignite/metrics/__init__.py index 47765f677eb..61003635964 100644 --- a/ignite/metrics/__init__.py +++ b/ignite/metrics/__init__.py @@ -15,7 +15,6 @@ from ignite.metrics.psnr import PSNR from ignite.metrics.recall import Recall from ignite.metrics.root_mean_squared_error import RootMeanSquaredError -from ignite.metrics.rouge import Rouge, RougeL, RougeN from ignite.metrics.running_average import RunningAverage from ignite.metrics.ssim import SSIM from ignite.metrics.top_k_categorical_accuracy import TopKCategoricalAccuracy @@ -43,9 +42,6 
@@ "PSNR", "Recall", "RootMeanSquaredError", - "Rouge", - "RougeN", - "RougeL", "RunningAverage", "VariableAccumulation", "Frequency", From 0cb54829936fd10fa0f49fbffb1f29beda396904 Mon Sep 17 00:00:00 2001 From: Sylvain Desroziers Date: Sun, 21 Mar 2021 20:26:45 +0100 Subject: [PATCH 05/13] Add nlp metrics in nlp.__init__ --- ignite/metrics/nlp/__init__.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/ignite/metrics/nlp/__init__.py b/ignite/metrics/nlp/__init__.py index e69de29bb2d..cc004b592f9 100644 --- a/ignite/metrics/nlp/__init__.py +++ b/ignite/metrics/nlp/__init__.py @@ -0,0 +1,9 @@ +from ignite.metrics.nlp.bleu import Bleu +from ignite.metrics.nlp.rouge import RougeN, RougeL, Rouge + +__all__ = [ + "Bleu", + "Rouge", + "RougeL", + "RougeN", +] From 59db6c0a2025c69ed09461feebe2449d4be1f01b Mon Sep 17 00:00:00 2001 From: sdesrozis Date: Sun, 21 Mar 2021 19:27:29 +0000 Subject: [PATCH 06/13] autopep8 fix --- ignite/metrics/nlp/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ignite/metrics/nlp/__init__.py b/ignite/metrics/nlp/__init__.py index cc004b592f9..20120ea7c0a 100644 --- a/ignite/metrics/nlp/__init__.py +++ b/ignite/metrics/nlp/__init__.py @@ -1,5 +1,5 @@ from ignite.metrics.nlp.bleu import Bleu -from ignite.metrics.nlp.rouge import RougeN, RougeL, Rouge +from ignite.metrics.nlp.rouge import Rouge, RougeL, RougeN __all__ = [ "Bleu", From 7aaa78910a50d57c9c449f0ea04e52d67e4ac3cc Mon Sep 17 00:00:00 2001 From: Desroziers Date: Mon, 22 Mar 2021 08:39:00 +0100 Subject: [PATCH 07/13] update --- ignite/metrics/__init__.py | 4 -- ignite/metrics/nlp/__init__.py | 9 +++ ignite/metrics/nlp/bleu.py | 32 ++++----- ignite/metrics/nlp/rouge.py | 52 ++------------- ignite/metrics/nlp/utils.py | 89 ++++++++++++++++++++++++++ tests/ignite/metrics/nlp/__init__.py | 61 ++++++++++++++++++ tests/ignite/metrics/nlp/test_bleu.py | 33 +++++----- tests/ignite/metrics/nlp/test_rouge.py | 81 +++-------------------- tests/ignite/metrics/nlp/test_utils.py | 57 +++++++++++++++++ 9 files changed, 260 insertions(+), 158 deletions(-) diff --git a/ignite/metrics/__init__.py b/ignite/metrics/__init__.py index 47765f677eb..61003635964 100644 --- a/ignite/metrics/__init__.py +++ b/ignite/metrics/__init__.py @@ -15,7 +15,6 @@ from ignite.metrics.psnr import PSNR from ignite.metrics.recall import Recall from ignite.metrics.root_mean_squared_error import RootMeanSquaredError -from ignite.metrics.rouge import Rouge, RougeL, RougeN from ignite.metrics.running_average import RunningAverage from ignite.metrics.ssim import SSIM from ignite.metrics.top_k_categorical_accuracy import TopKCategoricalAccuracy @@ -43,9 +42,6 @@ "PSNR", "Recall", "RootMeanSquaredError", - "Rouge", - "RougeN", - "RougeL", "RunningAverage", "VariableAccumulation", "Frequency", diff --git a/ignite/metrics/nlp/__init__.py b/ignite/metrics/nlp/__init__.py index e69de29bb2d..506f0bab51e 100644 --- a/ignite/metrics/nlp/__init__.py +++ b/ignite/metrics/nlp/__init__.py @@ -0,0 +1,9 @@ +from ignite.metrics.nlp.bleu import Bleu +from ignite.metrics.nlp.rouge import Rouge, RougeL, RougeN + +__all__ = [ + "Bleu", + "Rouge", + "RougeN", + "RougeL", +] diff --git a/ignite/metrics/nlp/bleu.py b/ignite/metrics/nlp/bleu.py index d36e68bda65..ff7f61607dd 100644 --- a/ignite/metrics/nlp/bleu.py +++ b/ignite/metrics/nlp/bleu.py @@ -1,12 +1,11 @@ import math from collections import Counter -from typing import Callable, Sequence, Optional, Tuple, Union, Any +from typing import Any, Callable, Optional, Sequence, Tuple, Union 
import torch from ignite.exceptions import NotComputableError from ignite.metrics.metric import Metric, reinit__is_reduced, sync_all_reduce - from ignite.metrics.nlp.utils import modified_precision __all__ = ["Bleu"] @@ -23,6 +22,7 @@ class _Smoother: Smoothing helper http://acl2014.org/acl2014/W14-33/pdf/W14-3346.pdf """ + def __init__(self, method: str): valid = ["no_smooth", "smooth1", "nltk_smooth2", "smooth2"] if method not in valid: @@ -36,13 +36,13 @@ def __call__(self, numerators: Counter, denominators: Counter) -> Sequence[float @staticmethod def smooth1(numerators: Counter, denominators: Counter) -> Sequence[float]: epsilon = 0.1 - denominators = [max(1, d) for d in denominators.values()] - return [n / d if n != 0 else epsilon / d for n, d in zip(numerators.values(), denominators)] + denominators_ = [max(1, d) for d in denominators.values()] + return [n / d if n != 0 else epsilon / d for n, d in zip(numerators.values(), denominators_)] @staticmethod def nltk_smooth2(numerators: Counter, denominators: Counter) -> Sequence[float]: - denominators = [max(1, d) for d in denominators.values()] - return [(n + 1) / (d + 1) for n, d in zip(numerators.values(), denominators)] + denominators_ = [max(1, d) for d in denominators.values()] + return [(n + 1) / (d + 1) for n, d in zip(numerators.values(), denominators_)] @staticmethod def smooth2(numerators: Counter, denominators: Counter) -> Sequence[float]: @@ -50,8 +50,8 @@ def smooth2(numerators: Counter, denominators: Counter) -> Sequence[float]: @staticmethod def no_smooth(numerators: Counter, denominators: Counter) -> Sequence[float]: - denominators = [max(1, d) for d in denominators.values()] - return [n / d for n, d in zip(numerators.values(), denominators)] + denominators_ = [max(1, d) for d in denominators.values()] + return [n / d for n, d in zip(numerators.values(), denominators_)] class Bleu(Metric): @@ -115,15 +115,11 @@ def __init__( self.smoother = _Smoother(method=smooth) super(Bleu, self).__init__(output_transform=output_transform, device=device) - def corpus_bleu( - self, references: Sequence[Sequence[Any]], candidates: Sequence[Sequence[Any]], - ): - p_numerators = Counter() # Key = ngram order, and value = no. of ngram matches. - p_denominators = Counter() # Key = ngram order, and value = no. of ngram in ref. + def corpus_bleu(self, references: Sequence[Sequence[Any]], candidates: Sequence[Sequence[Any]],) -> float: + p_numerators: Counter = Counter() # Key = ngram order, and value = no. of ngram matches. + p_denominators: Counter = Counter() # Key = ngram order, and value = no. of ngram in ref. - assert len(references) == len(candidates), ( - "The number of hypotheses and their reference(s) should be the same " - ) + assert len(references) == len(candidates), "The number of hypotheses and their reference(s) should be the same " # Iterate through each hypothesis and their corresponding references. 
for refs, hyp in zip(references, candidates): @@ -165,8 +161,8 @@ def corpus_bleu( # Compute the geometric mean s = [w_i * math.log(p_i) for w_i, p_i in zip(self.weights, p_n)] - s = bp * math.exp(math.fsum(s)) - return s + gm = bp * math.exp(math.fsum(s)) + return gm @reinit__is_reduced def reset(self) -> None: diff --git a/ignite/metrics/nlp/rouge.py b/ignite/metrics/nlp/rouge.py index 8c012a60596..18f264fe09d 100644 --- a/ignite/metrics/nlp/rouge.py +++ b/ignite/metrics/nlp/rouge.py @@ -1,5 +1,5 @@ from abc import ABCMeta, abstractmethod -from collections import Counter, namedtuple +from collections import namedtuple from typing import Any, Callable, List, Mapping, Optional, Sequence, Tuple, Union import torch @@ -9,53 +9,9 @@ # These decorators helps with distributed settings from ignite.metrics.metric import reinit__is_reduced, sync_all_reduce +from ignite.metrics.nlp.utils import lcs, ngrams - -def ngrams(sequence: Sequence[Any], n: int) -> Counter: - """ - Generate the ngrams from a sequence of items - - Args: - sequence: sequence of items - n: ngram order - - Returns: - A counter of ngram objects - - .. versionadded:: 0.5.0 - """ - return Counter([tuple(sequence[i : i + n]) for i in range(len(sequence) - n + 1)]) - - -def lcs(seq_a: Sequence[Any], seq_b: Sequence[Any]) -> int: - """ - Compute the length of the longest common subsequence in two sequence of items - https://en.wikipedia.org/wiki/Longest_common_subsequence_problem - - Args: - seq_a: first sequence of items - seq_b: second sequence of items - - Returns: - The length of the longest common subsequence - - .. versionadded:: 0.5.0 - """ - m = len(seq_a) - n = len(seq_b) - - dp = [[0] * (n + 1) for _ in range(m + 1)] - - for i in range(m + 1): - for j in range(n + 1): - if i == 0 or j == 0: - dp[i][j] = 0 - elif seq_a[i - 1] == seq_b[j - 1]: - dp[i][j] = dp[i - 1][j - 1] + 1 - else: - dp[i][j] = max(dp[i - 1][j], dp[i][j - 1]) - - return dp[m][n] +__all__ = ["Rouge", "RougeN", "RougeL"] class Score(namedtuple("Score", ["match", "candidate", "reference"])): @@ -289,7 +245,7 @@ def __init__( super(RougeN, self).__init__(multiref=multiref, alpha=alpha, output_transform=output_transform, device=device) self._ngram = ngram if self._ngram < 1: - raise ValueError(f"ngram order must be greater than one (got : {self._ngram})") + raise ValueError(f"ngram order must be greater than zero (got : {self._ngram})") def _compute_score(self, candidate: Sequence[Any], reference: Sequence[Any]) -> Score: return compute_ngram_scores(candidate=candidate, reference=reference, n=self._ngram) diff --git a/ignite/metrics/nlp/utils.py b/ignite/metrics/nlp/utils.py index e69de29bb2d..90be7fbff1e 100644 --- a/ignite/metrics/nlp/utils.py +++ b/ignite/metrics/nlp/utils.py @@ -0,0 +1,89 @@ +from collections import Counter +from typing import Any, Sequence, Tuple + +__all__ = ["ngrams", "lcs", "modified_precision"] + + +def ngrams(sequence: Sequence[Any], n: int) -> Counter: + """ + Generate the ngrams from a sequence of items + + Args: + sequence: sequence of items + n: n-gram order + + Returns: + A counter of ngram objects + + .. 
versionadded:: 0.5.0 + """ + return Counter([tuple(sequence[i : i + n]) for i in range(len(sequence) - n + 1)]) + + +def lcs(seq_a: Sequence[Any], seq_b: Sequence[Any]) -> int: + """ + Compute the length of the longest common subsequence in two sequence of items + https://en.wikipedia.org/wiki/Longest_common_subsequence_problem + + Args: + seq_a: first sequence of items + seq_b: second sequence of items + + Returns: + The length of the longest common subsequence + + .. versionadded:: 0.5.0 + """ + m = len(seq_a) + n = len(seq_b) + + dp = [[0] * (n + 1) for _ in range(m + 1)] + + for i in range(m + 1): + for j in range(n + 1): + if i == 0 or j == 0: + dp[i][j] = 0 + elif seq_a[i - 1] == seq_b[j - 1]: + dp[i][j] = dp[i - 1][j - 1] + 1 + else: + dp[i][j] = max(dp[i - 1][j], dp[i][j - 1]) + + return dp[m][n] + + +def modified_precision(references: Sequence[Sequence[Any]], candidate: Any, n: int) -> Tuple[int, int]: + """ + Compute the modified precision + + .. math:: + p_{n} = \frac{m_{n}}{l_{n}} + + where m_{n} is the number of matched n-grams between translation T and its reference R, and l_{n} is the + total number of n-grams in the translation T. + + More details can be found in `Papineni et al. 2002`__. + + __ https://www.aclweb.org/anthology/P02-1040.pdf + + Args: + references: list of references R + candidate: translation T + n: n-gram order + + Returns: + The length of the longest common subsequence + + .. versionadded:: 0.5.0 + """ + # ngrams of the candidate + counts = ngrams(candidate, n) + + # union of ngrams of references + max_counts: Counter = Counter() + for reference in references: + max_counts |= ngrams(reference, n) + + # clipped count of the candidate and references + clipped_counts = counts & max_counts + + return sum(clipped_counts.values()), sum(counts.values()) diff --git a/tests/ignite/metrics/nlp/__init__.py b/tests/ignite/metrics/nlp/__init__.py index e69de29bb2d..e12bafd8508 100644 --- a/tests/ignite/metrics/nlp/__init__.py +++ b/tests/ignite/metrics/nlp/__init__.py @@ -0,0 +1,61 @@ +__all__ = ["CorpusForTest"] + + +class CorpusForTest: + def __init__(self, lower_split=False): + def preproc(text): + if lower_split: + return text.lower().split() + else: + return text + + # BLEU Paper examples + self.cand_1 = preproc("the the the the the the the") + self.ref_1a = preproc("The cat is on the mat") + self.ref_1b = preproc("There is a cat on the mat") + + self.cand_2a = preproc( + "It is a guide to action which ensures that the military always obeys the commands of the party" + ) + self.cand_2b = preproc("It is to insure the troops forever hearing the activity guidebook that " "party direct") + self.ref_2a = preproc( + "It is a guide to action that ensures that the military will forever heed " "Party commands" + ) + self.ref_2b = preproc( + "It is the guiding principle which guarantees the military forces always being under the command of " + "the Party" + ) + self.ref_2c = preproc("It is the practical guide for the army always to heed the directions of the party") + + self.cand_3 = preproc("of the") + + self.references_1 = [self.ref_1a, self.ref_1b] + self.references_2 = [self.ref_2a, self.ref_2b, self.ref_2c] + + self.sample_1 = ([self.cand_1], [self.references_1]) + self.sample_2 = ([self.cand_3], [self.references_2]) + self.sample_3 = ([self.cand_2a], [self.references_2]) + self.sample_4 = ([self.cand_2b], [self.references_2]) + self.sample_5 = ([self.cand_2a, self.cand_2b], [self.references_2, self.references_2]) + + self.references_3 = [self.ref_2a, self.ref_2b] + 
self.references_4 = [self.ref_2b, self.ref_2c] + self.references_5 = [self.ref_2a, self.ref_2c] + + self.chunks = [ + (self.cand_1, self.references_1), + (self.cand_2a, self.references_2), + (self.cand_2b, self.references_2), + (self.cand_1, [self.ref_1a]), + (self.cand_2a, self.references_3), + (self.cand_2b, self.references_3), + (self.cand_1, [self.ref_1b]), + (self.cand_2a, self.references_4), + (self.cand_2b, self.references_4), + (self.cand_1, self.references_1), + (self.cand_2a, self.references_5), + (self.cand_2b, self.references_5), + (self.cand_1, [self.ref_1a]), + (self.cand_2a, [self.ref_2a]), + (self.cand_2b, [self.ref_2c]), + ] diff --git a/tests/ignite/metrics/nlp/test_bleu.py b/tests/ignite/metrics/nlp/test_bleu.py index 21aee7fad3e..57171d60bbe 100644 --- a/tests/ignite/metrics/nlp/test_bleu.py +++ b/tests/ignite/metrics/nlp/test_bleu.py @@ -1,10 +1,9 @@ import os -import pytest import warnings -from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction - +import pytest import torch +from nltk.translate.bleu_score import SmoothingFunction, corpus_bleu import ignite.distributed as idist from ignite.exceptions import NotComputableError @@ -65,9 +64,9 @@ def test_corpus_bleu_smooth1(candidate, references): weights = tuple([1 / i] * i) with warnings.catch_warnings(): warnings.simplefilter("ignore") - reference = corpus_bleu(references, candidate, - weights=weights, - smoothing_function=SmoothingFunction().method1) + reference = corpus_bleu( + references, candidate, weights=weights, smoothing_function=SmoothingFunction().method1 + ) bleu = Bleu(ngram=i, smooth="smooth1") assert reference == bleu.corpus_bleu(references, candidate) bleu.update((candidate[0], references[0])) @@ -89,9 +88,9 @@ def test_corpus_bleu_nltk_smooth2(candidate, references): weights = tuple([1 / i] * i) with warnings.catch_warnings(): warnings.simplefilter("ignore") - reference = corpus_bleu(references, candidate, - weights=weights, - smoothing_function=SmoothingFunction().method2) + reference = corpus_bleu( + references, candidate, weights=weights, smoothing_function=SmoothingFunction().method2 + ) bleu = Bleu(ngram=i, smooth="nltk_smooth2") assert reference == bleu.corpus_bleu(references, candidate) bleu.update((candidate[0], references[0])) @@ -113,9 +112,9 @@ def test_corpus_bleu_smooth2(candidate, references): weights = tuple([1 / i] * i) with warnings.catch_warnings(): warnings.simplefilter("ignore") - reference = corpus_bleu(references, candidate, - weights=weights, - smoothing_function=SmoothingFunction().method2) + reference = corpus_bleu( + references, candidate, weights=weights, smoothing_function=SmoothingFunction().method2 + ) bleu = Bleu(ngram=i, smooth="smooth2") assert reference == bleu.corpus_bleu(references, candidate) bleu.update((candidate[0], references[0])) @@ -163,9 +162,12 @@ def _test(metric_device): for candidate, references in data: with warnings.catch_warnings(): warnings.simplefilter("ignore") - ref_bleu += corpus_bleu([references], [candidate], - weights=[0.25, 0.25, 0.25, 0.25], - smoothing_function=SmoothingFunction().method2) + ref_bleu += corpus_bleu( + [references], + [candidate], + weights=[0.25, 0.25, 0.25, 0.25], + smoothing_function=SmoothingFunction().method2, + ) assert pytest.approx(engine.state.metrics["bleu"]) == ref_bleu / len(data) @@ -236,4 +238,3 @@ def _test_distrib_xla_nprocs(index): def test_distrib_xla_nprocs(xmp_executor): n = int(os.environ["NUM_TPU_WORKERS"]) xmp_executor(_test_distrib_xla_nprocs, args=(), nprocs=n) - diff --git 
a/tests/ignite/metrics/nlp/test_rouge.py b/tests/ignite/metrics/nlp/test_rouge.py index b83c97afaf1..40aafae189c 100644 --- a/tests/ignite/metrics/nlp/test_rouge.py +++ b/tests/ignite/metrics/nlp/test_rouge.py @@ -7,36 +7,14 @@ import ignite.distributed as idist from ignite.exceptions import NotComputableError -from ignite.metrics import Rouge -from ignite.metrics.nlp.rouge import RougeL, RougeN, compute_ngram_scores, lcs, ngrams +from ignite.metrics.nlp import Rouge +from ignite.metrics.nlp.rouge import RougeL, RougeN, compute_ngram_scores + +from . import CorpusForTest nltk.download("punkt") - -@pytest.mark.parametrize( - "sequence, n, expected_keys, expected_values", - [ - ([], 1, [], []), - ([0, 1, 2], 1, [(0,), (1,), (2,)], [1, 1, 1]), - ([0, 1, 2], 2, [(0, 1,), (1, 2,),], [1, 1],), - ([0, 1, 2], 3, [(0, 1, 2)], [1]), - ([0, 0, 0], 1, [(0,)], [3]), - ([0, 0, 0], 2, [(0, 0)], [2]), - ("abcde", 4, [("a", "b", "c", "d"), ("b", "c", "d", "e")], [1, 1]), - ], -) -def test_ngrams(sequence, n, expected_keys, expected_values): - ngrams_counter = ngrams(sequence=sequence, n=n) - assert list(ngrams_counter.values()) == expected_values - assert list(ngrams_counter.keys()) == expected_keys - - -@pytest.mark.parametrize( - "seq_a, seq_b, expected", - [([], [], 0), ([0, 1, 2], [0, 1, 2], 3), ([0, 1, 2], [0, 3, 2], 2), ("academy", "abracadabra", 4),], -) -def test_lcs(seq_a, seq_b, expected): - assert lcs(seq_a, seq_b) == expected +corpus = CorpusForTest() @pytest.mark.parametrize( @@ -61,7 +39,7 @@ def test_compute_ngram_scores(candidate, reference, n, expected_precision, expec def test_wrong_inputs(): - with pytest.raises(ValueError, match=r"ngram order must be greater than one"): + with pytest.raises(ValueError, match=r"ngram order must be greater than zero"): RougeN(ngram=0) with pytest.raises(ValueError, match=r"alpha must be in interval \[0, 1\]"): @@ -106,31 +84,8 @@ def test_rouge_n_alpha(ngram, candidate, reference, expected): assert results[f"Rouge-{ngram}-F"] == F -# BLEU Paper examples -CAND_1 = "the the the the the the the" -REF_1A = "The cat is on the mat" -REF_1B = "There is a cat on the mat" - -CAND_2A = "It is a guide to action which ensures that the military always obeys the " "commands of the party" -CAND_2B = "It is to insure the troops forever hearing the activity guidebook that " "party direct" -REF_2A = "It is a guide to action that ensures that the military will forever heed " "Party commands" -REF_2B = ( - "It is the guiding principle which guarantees the military forces always being under the " "command of the Party" -) -REF_2C = "It is the practical guide for the army always to heed the directions of the " "party" - -CAND_3 = "of the" - - @pytest.mark.parametrize( - "candidates, references", - [ - ([CAND_1], [[REF_1A, REF_1B]]), - ([CAND_3], [[REF_2A, REF_2B, REF_2C]]), - ([CAND_2A], [[REF_2A, REF_2B, REF_2C]]), - ([CAND_2B], [[REF_2A, REF_2B, REF_2C]]), - ([CAND_2A, CAND_2B], [[REF_2A, REF_2B, REF_2C], [REF_2A, REF_2B, REF_2C]]), - ], + "candidates, references", [corpus.sample_1, corpus.sample_2, corpus.sample_3, corpus.sample_4, corpus.sample_5,], ) def test_rouge_metrics(candidates, references): for multiref in ["average", "best"]: @@ -171,28 +126,10 @@ def _test_distrib_integration(device): rank = idist.get_rank() - chunks = [ - (CAND_1, [REF_1A, REF_1B]), - (CAND_2A, [REF_2A, REF_2B, REF_2C]), - (CAND_2B, [REF_2A, REF_2B, REF_2C]), - (CAND_1, [REF_1A]), - (CAND_2A, [REF_2A, REF_2B]), - (CAND_2B, [REF_2A, REF_2B]), - (CAND_1, [REF_1B]), - (CAND_2A, [REF_2B, REF_2C]), - 
(CAND_2B, [REF_2B, REF_2C]), - (CAND_1, [REF_1A, REF_1B]), - (CAND_2A, [REF_2A, REF_2C]), - (CAND_2B, [REF_2A, REF_2C]), - (CAND_1, [REF_1A]), - (CAND_2A, [REF_2A]), - (CAND_2B, [REF_2C]), - ] - - size = len(chunks) + size = len(corpus.chunks) data = [] - for c in chunks: + for c in corpus.chunks: data += idist.get_world_size() * [c] def update(_, i): diff --git a/tests/ignite/metrics/nlp/test_utils.py b/tests/ignite/metrics/nlp/test_utils.py index e69de29bb2d..8cf267a68bd 100644 --- a/tests/ignite/metrics/nlp/test_utils.py +++ b/tests/ignite/metrics/nlp/test_utils.py @@ -0,0 +1,57 @@ +import pytest + +from ignite.metrics.nlp.utils import lcs, modified_precision, ngrams + + +@pytest.mark.parametrize( + "sequence, n, expected_keys, expected_values", + [ + ([], 1, [], []), + ([0, 1, 2], 1, [(0,), (1,), (2,)], [1, 1, 1]), + ([0, 1, 2], 2, [(0, 1,), (1, 2,),], [1, 1],), + ([0, 1, 2], 3, [(0, 1, 2)], [1]), + ([0, 0, 0], 1, [(0,)], [3]), + ([0, 0, 0], 2, [(0, 0)], [2]), + ("abcde", 4, [("a", "b", "c", "d"), ("b", "c", "d", "e")], [1, 1]), + ], +) +def test_ngrams(sequence, n, expected_keys, expected_values): + ngrams_counter = ngrams(sequence=sequence, n=n) + assert list(ngrams_counter.values()) == expected_values + assert list(ngrams_counter.keys()) == expected_keys + + +@pytest.mark.parametrize( + "seq_a, seq_b, expected", + [([], [], 0), ([0, 1, 2], [0, 1, 2], 3), ([0, 1, 2], [0, 3, 2], 2), ("academy", "abracadabra", 4),], +) +def test_lcs(seq_a, seq_b, expected): + assert lcs(seq_a, seq_b) == expected + + +def test_modified_precision_empty(): + for k in range(1, 5): + n, d = modified_precision([[]], [], k) + assert n == 0 and d == 0 + n, d = modified_precision([[]], [0], k) + assert n == 0 and d == (k == 1) + n, d = modified_precision([[0]], [], k) + assert n == 0 and d == 0 + n, d = modified_precision([[]], list(range(k)), k) + assert n == 0 and d == 1 + n, d = modified_precision([list(range(k))], [], k) + assert n == 0 and d == 0 + + +@pytest.mark.parametrize( + "references, candidate, expected", + [ + ([[0, 0, 0], [1, 2]], [1, 2, 3, 4], ((2, 4), (1, 3), (0, 2))), + ([[0, 1, 2], [0, 0, 3]], [0, 0, 0, 1, 2], ((4, 5), (3, 4), (1, 3))), + ([[0, 1, 2], [3, 0, 3]], [3, 0, 0, 1, 2], ((4, 5), (3, 4), (1, 3))), + ], +) +def test_modified_precision(references, candidate, expected): + for n, (e_n, e_d) in enumerate(expected, start=1): + n, d = modified_precision(references, candidate, n) + assert n == e_n and d == e_d From 2a3a81b89e6fd37a946dd240d99085becf7b3920 Mon Sep 17 00:00:00 2001 From: Desroziers Date: Mon, 22 Mar 2021 08:51:17 +0100 Subject: [PATCH 08/13] expose nlp - replace assert by exception --- ignite/metrics/__init__.py | 6 ++++++ ignite/metrics/nlp/bleu.py | 12 ++++++++---- tests/ignite/metrics/nlp/test_bleu.py | 3 +++ 3 files changed, 17 insertions(+), 4 deletions(-) diff --git a/ignite/metrics/__init__.py b/ignite/metrics/__init__.py index 61003635964..d743faf4f62 100644 --- a/ignite/metrics/__init__.py +++ b/ignite/metrics/__init__.py @@ -11,6 +11,8 @@ from ignite.metrics.metric import BatchFiltered, BatchWise, EpochWise, Metric, MetricUsage from ignite.metrics.metrics_lambda import MetricsLambda from ignite.metrics.multilabel_confusion_matrix import MultiLabelConfusionMatrix +from ignite.metrics.nlp.bleu import Bleu +from ignite.metrics.nlp.rouge import Rouge, RougeL, RougeN from ignite.metrics.precision import Precision from ignite.metrics.psnr import PSNR from ignite.metrics.recall import Recall @@ -46,4 +48,8 @@ "VariableAccumulation", "Frequency", "SSIM", + "Bleu", + "Rouge", 
+ "RougeN", + "RougeL", ] diff --git a/ignite/metrics/nlp/bleu.py b/ignite/metrics/nlp/bleu.py index ff7f61607dd..fad3bed41e4 100644 --- a/ignite/metrics/nlp/bleu.py +++ b/ignite/metrics/nlp/bleu.py @@ -116,10 +116,14 @@ def __init__( super(Bleu, self).__init__(output_transform=output_transform, device=device) def corpus_bleu(self, references: Sequence[Sequence[Any]], candidates: Sequence[Sequence[Any]],) -> float: - p_numerators: Counter = Counter() # Key = ngram order, and value = no. of ngram matches. - p_denominators: Counter = Counter() # Key = ngram order, and value = no. of ngram in ref. - - assert len(references) == len(candidates), "The number of hypotheses and their reference(s) should be the same " + p_numerators: Counter = Counter() + p_denominators: Counter = Counter() + + if len(references) != len(candidates): + raise ValueError( + f"nb of candidates should be equal to nb of reference lists ({len(candidates)} != " + f"{len(references)})" + ) # Iterate through each hypothesis and their corresponding references. for refs, hyp in zip(references, candidates): diff --git a/tests/ignite/metrics/nlp/test_bleu.py b/tests/ignite/metrics/nlp/test_bleu.py index 57171d60bbe..69fd2551fcc 100644 --- a/tests/ignite/metrics/nlp/test_bleu.py +++ b/tests/ignite/metrics/nlp/test_bleu.py @@ -22,6 +22,9 @@ def test_wrong_inputs(): with pytest.raises(ValueError, match=r"Smooth is not valid"): Bleu(smooth="fake") + with pytest.raises(ValueError, match=r"nb of candidates should be equal to nb of reference lists"): + Bleu().corpus_bleu(references=[[0], [0]], candidates=[[0]]) + with pytest.raises(NotComputableError): Bleu().compute() From 2d8af33f7425ffebde0dbbb883a9764607abbdc4 Mon Sep 17 00:00:00 2001 From: Desroziers Date: Mon, 22 Mar 2021 09:39:08 +0100 Subject: [PATCH 09/13] fix F401 --- ignite/metrics/nlp/bleu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ignite/metrics/nlp/bleu.py b/ignite/metrics/nlp/bleu.py index fad3bed41e4..0940184360e 100644 --- a/ignite/metrics/nlp/bleu.py +++ b/ignite/metrics/nlp/bleu.py @@ -1,6 +1,6 @@ import math from collections import Counter -from typing import Any, Callable, Optional, Sequence, Tuple, Union +from typing import Any, Callable, Sequence, Tuple, Union import torch From ded5f80be6be96d678c048114532c81ed833a97f Mon Sep 17 00:00:00 2001 From: Desroziers Date: Mon, 22 Mar 2021 16:16:18 +0100 Subject: [PATCH 10/13] fix doc --- docs/source/metrics.rst | 7 ++++--- ignite/metrics/nlp/bleu.py | 17 +++++++++++------ tests/ignite/metrics/nlp/test_bleu.py | 18 +++++++++--------- 3 files changed, 24 insertions(+), 18 deletions(-) diff --git a/docs/source/metrics.rst b/docs/source/metrics.rst index eddd6b713fd..3c37f5600b1 100644 --- a/docs/source/metrics.rst +++ b/docs/source/metrics.rst @@ -323,13 +323,14 @@ Complete list of metrics precision.Precision PSNR recall.Recall - Rouge - rouge.RougeL - rouge.RougeN RootMeanSquaredError RunningAverage SSIM TopKCategoricalAccuracy + Bleu + Rouge + RougeL + RougeN Helpers for customizing metrics ------------------------------- diff --git a/ignite/metrics/nlp/bleu.py b/ignite/metrics/nlp/bleu.py index 0940184360e..6e94bca134d 100644 --- a/ignite/metrics/nlp/bleu.py +++ b/ignite/metrics/nlp/bleu.py @@ -58,9 +58,9 @@ class Bleu(Metric): r"""Calculates the `BLEU score `_. .. 
math:: - \text{BLEU} = \text{BP} \dot exp \left( \sum_{n=1}^{N} w_{n} log p_{n} \right) + \text{BLEU} = b_{p} \cdot \exp \left( \sum_{n=1}^{N} w_{n} \: \log p_{n} \right) - where :math:`N` is the order of n-grams, :math:`\text{BP}` is a sentence brevety penalty, :math:`w_{n}` are + where :math:`N` is the order of n-grams, :math:`b_{p}` is a sentence brevety penalty, :math:`w_{n}` are positive weights summing to one and :math:`p_{n}` are modified n-gram precisions. More details can be found in `Papineni et al. 2002`__. @@ -77,7 +77,8 @@ class Bleu(Metric): Args: ngram: order of n-grams. - smooth: enable smoothing. Valid are "no_smooth", "smooth1", "nltk_smooth2" or "smooth2". (Default: "no_smooth") + smooth: enable smoothing. Valid are ``no_smooth``, ``smooth1``, ``nltk_smooth2`` or ``smooth2``. + Default: ``no_smooth``. output_transform: a callable that is used to transform the :class:`~ignite.engine.engine.Engine`'s ``process_function``'s output into the form expected by the metric. This can be useful if, for example, you have a multi-output model and @@ -91,11 +92,15 @@ class Bleu(Metric): .. code-block:: python - from ignite.metrics import Bleu + from ignite.metrics.nlp import Bleu + m = Bleu(ngram=4, smooth="smooth1") + y_pred = "the the the the the the the" y = ["the cat is on the mat", "there is a cat on the mat"] + m.update((y_pred.split(), [y.split()])) + print(m.compute()) .. versionadded:: 0.5.0 @@ -115,7 +120,7 @@ def __init__( self.smoother = _Smoother(method=smooth) super(Bleu, self).__init__(output_transform=output_transform, device=device) - def corpus_bleu(self, references: Sequence[Sequence[Any]], candidates: Sequence[Sequence[Any]],) -> float: + def _corpus_bleu(self, references: Sequence[Sequence[Any]], candidates: Sequence[Sequence[Any]], ) -> float: p_numerators: Counter = Counter() p_denominators: Counter = Counter() @@ -176,7 +181,7 @@ def reset(self) -> None: @reinit__is_reduced def update(self, output: Tuple[Sequence[Any], Sequence[Sequence[Any]]]) -> None: y_pred, y = output - self._sum_of_bleu += self.corpus_bleu(references=[y], candidates=[y_pred]) + self._sum_of_bleu += self._corpus_bleu(references=[y], candidates=[y_pred]) self._num_sentences += 1 @sync_all_reduce("_sum_of_bleu", "_num_sentences") diff --git a/tests/ignite/metrics/nlp/test_bleu.py b/tests/ignite/metrics/nlp/test_bleu.py index 69fd2551fcc..45169bacf68 100644 --- a/tests/ignite/metrics/nlp/test_bleu.py +++ b/tests/ignite/metrics/nlp/test_bleu.py @@ -23,7 +23,7 @@ def test_wrong_inputs(): Bleu(smooth="fake") with pytest.raises(ValueError, match=r"nb of candidates should be equal to nb of reference lists"): - Bleu().corpus_bleu(references=[[0], [0]], candidates=[[0]]) + Bleu()._corpus_bleu(references=[[0], [0]], candidates=[[0]]) with pytest.raises(NotComputableError): Bleu().compute() @@ -47,7 +47,7 @@ def test_corpus_bleu(candidate, references): warnings.simplefilter("ignore") reference = corpus_bleu(references, candidate, weights=weights) bleu = Bleu(ngram=i) - assert pytest.approx(reference) == bleu.corpus_bleu(references, candidate) + assert pytest.approx(reference) == bleu._corpus_bleu(references, candidate) bleu.update((candidate[0], references[0])) assert pytest.approx(reference) == bleu.compute() @@ -71,7 +71,7 @@ def test_corpus_bleu_smooth1(candidate, references): references, candidate, weights=weights, smoothing_function=SmoothingFunction().method1 ) bleu = Bleu(ngram=i, smooth="smooth1") - assert reference == bleu.corpus_bleu(references, candidate) + assert reference == 
bleu._corpus_bleu(references, candidate) bleu.update((candidate[0], references[0])) assert reference == bleu.compute() @@ -95,7 +95,7 @@ def test_corpus_bleu_nltk_smooth2(candidate, references): references, candidate, weights=weights, smoothing_function=SmoothingFunction().method2 ) bleu = Bleu(ngram=i, smooth="nltk_smooth2") - assert reference == bleu.corpus_bleu(references, candidate) + assert reference == bleu._corpus_bleu(references, candidate) bleu.update((candidate[0], references[0])) assert reference == bleu.compute() @@ -119,7 +119,7 @@ def test_corpus_bleu_smooth2(candidate, references): references, candidate, weights=weights, smoothing_function=SmoothingFunction().method2 ) bleu = Bleu(ngram=i, smooth="smooth2") - assert reference == bleu.corpus_bleu(references, candidate) + assert reference == bleu._corpus_bleu(references, candidate) bleu.update((candidate[0], references[0])) assert reference == bleu.compute() @@ -130,10 +130,10 @@ def test_bleu(): bleu.update((corpus.cand_2a, corpus.references_2)) bleu.update((corpus.cand_2b, corpus.references_2)) bleu.update((corpus.cand_3, corpus.references_2)) - value = bleu.corpus_bleu([corpus.references_1], [corpus.cand_1]) - value += bleu.corpus_bleu([corpus.references_2], [corpus.cand_2a]) - value += bleu.corpus_bleu([corpus.references_2], [corpus.cand_2b]) - value += bleu.corpus_bleu([corpus.references_2], [corpus.cand_3]) + value = bleu._corpus_bleu([corpus.references_1], [corpus.cand_1]) + value += bleu._corpus_bleu([corpus.references_2], [corpus.cand_2a]) + value += bleu._corpus_bleu([corpus.references_2], [corpus.cand_2b]) + value += bleu._corpus_bleu([corpus.references_2], [corpus.cand_3]) assert bleu.compute() == value / 4 From 219a19f2838e3504479b19ffd6d14d4c5f9c7c1c Mon Sep 17 00:00:00 2001 From: sdesrozis Date: Mon, 22 Mar 2021 15:18:13 +0000 Subject: [PATCH 11/13] autopep8 fix --- ignite/metrics/nlp/bleu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ignite/metrics/nlp/bleu.py b/ignite/metrics/nlp/bleu.py index 6e94bca134d..90be793d9fd 100644 --- a/ignite/metrics/nlp/bleu.py +++ b/ignite/metrics/nlp/bleu.py @@ -120,7 +120,7 @@ def __init__( self.smoother = _Smoother(method=smooth) super(Bleu, self).__init__(output_transform=output_transform, device=device) - def _corpus_bleu(self, references: Sequence[Sequence[Any]], candidates: Sequence[Sequence[Any]], ) -> float: + def _corpus_bleu(self, references: Sequence[Sequence[Any]], candidates: Sequence[Sequence[Any]],) -> float: p_numerators: Counter = Counter() p_denominators: Counter = Counter() From 1bf1e9c97dffe58784bcddbac9c0cd9f807b7dbc Mon Sep 17 00:00:00 2001 From: Desroziers Date: Mon, 22 Mar 2021 16:42:01 +0100 Subject: [PATCH 12/13] add test --- tests/ignite/metrics/nlp/test_bleu.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/ignite/metrics/nlp/test_bleu.py b/tests/ignite/metrics/nlp/test_bleu.py index 45169bacf68..c98143cf98d 100644 --- a/tests/ignite/metrics/nlp/test_bleu.py +++ b/tests/ignite/metrics/nlp/test_bleu.py @@ -32,6 +32,7 @@ def test_wrong_inputs(): @pytest.mark.parametrize( "candidate, references", [ + ([["a"], ["a"]]), ([["a", "a", "a", "b", "c"]], [[["a", "b", "c"], ["a", "a", "d"]]]), corpus.sample_1, corpus.sample_2, From 989f4bf71ac84945eb1178988daff9988b4518d8 Mon Sep 17 00:00:00 2001 From: Sylvain Desroziers Date: Mon, 22 Mar 2021 21:48:27 +0100 Subject: [PATCH 13/13] Resolve conflict --- ignite/metrics/nlp/rouge.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/ignite/metrics/nlp/rouge.py 
b/ignite/metrics/nlp/rouge.py index 18f264fe09d..d0dede92335 100644 --- a/ignite/metrics/nlp/rouge.py +++ b/ignite/metrics/nlp/rouge.py @@ -139,10 +139,7 @@ def __init__( def _get_multiref_reducer(self) -> MultiRefReducer: if self._multiref == "average": return MultiRefAverageReducer() - elif self._multiref == "best": - return MultiRefBestReducer() - else: - raise ValueError(f"multiref : wrong value (got : {self._multiref})") + return MultiRefBestReducer() @reinit__is_reduced def reset(self) -> None:
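
For reference, a minimal usage sketch of the ``Bleu`` metric as added by this series. Note that the docstring example introduced in PATCH 01 calls ``y.split()`` on a list of reference strings, which would raise an ``AttributeError``; the variant below tokenizes each reference individually, matching the ``update`` signature ``(candidate_tokens, [reference_tokens, ...])`` defined in ``bleu.py``:

    from ignite.metrics.nlp import Bleu

    m = Bleu(ngram=4, smooth="smooth1")

    y_pred = "the the the the the the the"
    y = ["the cat is on the mat", "there is a cat on the mat"]

    # candidate as a token list, references as a list of token lists
    m.update((y_pred.split(), [ref.split() for ref in y]))
    print(m.compute())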