From ffea5fc6c8d1ff32db289663709e550ed15e7ecf Mon Sep 17 00:00:00 2001 From: Desroziers Date: Sun, 21 Mar 2021 18:46:39 +0100 Subject: [PATCH 01/13] add bleu metric - refactor rouge and add nlp module --- ignite/metrics/nlp/__init__.py | 0 ignite/metrics/nlp/bleu.py | 186 +++++++++++++++ ignite/metrics/{ => nlp}/rouge.py | 0 ignite/metrics/nlp/utils.py | 0 tests/ignite/metrics/nlp/__init__.py | 0 tests/ignite/metrics/nlp/test_bleu.py | 239 +++++++++++++++++++ tests/ignite/metrics/{ => nlp}/test_rouge.py | 2 +- tests/ignite/metrics/nlp/test_utils.py | 0 8 files changed, 426 insertions(+), 1 deletion(-) create mode 100644 ignite/metrics/nlp/__init__.py create mode 100644 ignite/metrics/nlp/bleu.py rename ignite/metrics/{ => nlp}/rouge.py (100%) create mode 100644 ignite/metrics/nlp/utils.py create mode 100644 tests/ignite/metrics/nlp/__init__.py create mode 100644 tests/ignite/metrics/nlp/test_bleu.py rename tests/ignite/metrics/{ => nlp}/test_rouge.py (99%) create mode 100644 tests/ignite/metrics/nlp/test_utils.py diff --git a/ignite/metrics/nlp/__init__.py b/ignite/metrics/nlp/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/ignite/metrics/nlp/bleu.py b/ignite/metrics/nlp/bleu.py new file mode 100644 index 00000000000..d36e68bda65 --- /dev/null +++ b/ignite/metrics/nlp/bleu.py @@ -0,0 +1,186 @@ +import math +from collections import Counter +from typing import Callable, Sequence, Optional, Tuple, Union, Any + +import torch + +from ignite.exceptions import NotComputableError +from ignite.metrics.metric import Metric, reinit__is_reduced, sync_all_reduce + +from ignite.metrics.nlp.utils import modified_precision + +__all__ = ["Bleu"] + + +def _closest_ref_length(references: Sequence[Sequence[Any]], hyp_len: int) -> int: + ref_lens = (len(reference) for reference in references) + closest_ref_len = min(ref_lens, key=lambda ref_len: (abs(ref_len - hyp_len), ref_len)) + return closest_ref_len + + +class _Smoother: + """ + Smoothing helper + http://acl2014.org/acl2014/W14-33/pdf/W14-3346.pdf + """ + def __init__(self, method: str): + valid = ["no_smooth", "smooth1", "nltk_smooth2", "smooth2"] + if method not in valid: + raise ValueError(f"Smooth is not valid (expected: {valid}, got: {method})") + self.smooth = method + + def __call__(self, numerators: Counter, denominators: Counter) -> Sequence[float]: + method = getattr(self, self.smooth) + return method(numerators, denominators) + + @staticmethod + def smooth1(numerators: Counter, denominators: Counter) -> Sequence[float]: + epsilon = 0.1 + denominators = [max(1, d) for d in denominators.values()] + return [n / d if n != 0 else epsilon / d for n, d in zip(numerators.values(), denominators)] + + @staticmethod + def nltk_smooth2(numerators: Counter, denominators: Counter) -> Sequence[float]: + denominators = [max(1, d) for d in denominators.values()] + return [(n + 1) / (d + 1) for n, d in zip(numerators.values(), denominators)] + + @staticmethod + def smooth2(numerators: Counter, denominators: Counter) -> Sequence[float]: + return [(n + 1) / (d + 1) for n, d in zip(numerators.values(), denominators.values())] + + @staticmethod + def no_smooth(numerators: Counter, denominators: Counter) -> Sequence[float]: + denominators = [max(1, d) for d in denominators.values()] + return [n / d for n, d in zip(numerators.values(), denominators)] + + +class Bleu(Metric): + r"""Calculates the `BLEU score `_. + + .. 
math:: + \text{BLEU} = \text{BP} \dot exp \left( \sum_{n=1}^{N} w_{n} log p_{n} \right) + + where :math:`N` is the order of n-grams, :math:`\text{BP}` is a sentence brevety penalty, :math:`w_{n}` are + positive weights summing to one and :math:`p_{n}` are modified n-gram precisions. + + More details can be found in `Papineni et al. 2002`__. + + __ https://www.aclweb.org/anthology/P02-1040.pdf + + In addition, a review of smoothing techniques can be found in `Chen et al. 2014`__ + + __ http://acl2014.org/acl2014/W14-33/pdf/W14-3346.pdf + + Remark : + + This implementation is inspired by nltk + + Args: + ngram: order of n-grams. + smooth: enable smoothing. Valid are "no_smooth", "smooth1", "nltk_smooth2" or "smooth2". (Default: "no_smooth") + output_transform: a callable that is used to transform the + :class:`~ignite.engine.engine.Engine`'s ``process_function``'s output into the + form expected by the metric. This can be useful if, for example, you have a multi-output model and + you want to compute the metric with respect to one of the outputs. + By default, metrics require the output as ``(y_pred, y)`` or ``{'y_pred': y_pred, 'y': y}``. + device: specifies which device updates are accumulated on. Setting the + metric's device to be the same as your ``update`` arguments ensures the ``update`` method is + non-blocking. By default, CPU. + + Example: + + .. code-block:: python + + from ignite.metrics import Bleu + m = Bleu(ngram=4, smooth="smooth1") + y_pred = "the the the the the the the" + y = ["the cat is on the mat", "there is a cat on the mat"] + m.update((y_pred.split(), [y.split()])) + print(m.compute()) + + .. versionadded:: 0.5.0 + """ + + def __init__( + self, + ngram: int = 4, + smooth: str = "no_smooth", + output_transform: Callable = lambda x: x, + device: Union[str, torch.device] = torch.device("cpu"), + ): + if ngram <= 0: + raise ValueError(f"ngram order must be greater than zero (got: {ngram})") + self.ngrams_order = ngram + self.weights = [1 / self.ngrams_order] * self.ngrams_order + self.smoother = _Smoother(method=smooth) + super(Bleu, self).__init__(output_transform=output_transform, device=device) + + def corpus_bleu( + self, references: Sequence[Sequence[Any]], candidates: Sequence[Sequence[Any]], + ): + p_numerators = Counter() # Key = ngram order, and value = no. of ngram matches. + p_denominators = Counter() # Key = ngram order, and value = no. of ngram in ref. + + assert len(references) == len(candidates), ( + "The number of hypotheses and their reference(s) should be the same " + ) + + # Iterate through each hypothesis and their corresponding references. + for refs, hyp in zip(references, candidates): + # For each order of ngram, calculate the numerator and + # denominator for the corpus-level modified precision. + for i in range(1, self.ngrams_order + 1): + numerator, denominator = modified_precision(refs, hyp, i) + p_numerators[i] += numerator + p_denominators[i] += denominator + + # Returns 0 if there's no matching n-grams + # We only need to check for p_numerators[1] == 0, since if there's + # no unigrams, there won't be any higher order ngrams. + if p_numerators[1] == 0: + return 0 + + # If no smoother, returns 0 if there's at least one a not matching n-grams + if self.smoother.smooth == "no_smooth" and min(p_numerators.values()) == 0: + return 0 + + # Calculate the hypothesis lengths + hyp_lengths = [len(hyp) for hyp in candidates] + + # Calculate the closest reference lengths. 
+ ref_lengths = [_closest_ref_length(refs, hyp_len) for refs, hyp_len in zip(references, hyp_lengths)] + + # Sum of hypothesis and references lengths + hyp_len = sum(hyp_lengths) + ref_len = sum(ref_lengths) + + # Calculate corpus-level brevity penalty. + if hyp_len < ref_len: + bp = math.exp(1 - ref_len / hyp_len) if hyp_len > 0 else 0.0 + else: + bp = 1.0 + + # Smoothing + p_n = self.smoother(p_numerators, p_denominators) + + # Compute the geometric mean + s = [w_i * math.log(p_i) for w_i, p_i in zip(self.weights, p_n)] + s = bp * math.exp(math.fsum(s)) + return s + + @reinit__is_reduced + def reset(self) -> None: + self._sum_of_bleu = torch.tensor(0.0, dtype=torch.double, device=self._device) + self._num_sentences = 0 + + @reinit__is_reduced + def update(self, output: Tuple[Sequence[Any], Sequence[Sequence[Any]]]) -> None: + y_pred, y = output + self._sum_of_bleu += self.corpus_bleu(references=[y], candidates=[y_pred]) + self._num_sentences += 1 + + @sync_all_reduce("_sum_of_bleu", "_num_sentences") + def compute(self) -> torch.Tensor: + if self._num_sentences == 0: + raise NotComputableError("Bleu must have at least one example before it can be computed.") + return self._sum_of_bleu / self._num_sentences diff --git a/ignite/metrics/rouge.py b/ignite/metrics/nlp/rouge.py similarity index 100% rename from ignite/metrics/rouge.py rename to ignite/metrics/nlp/rouge.py diff --git a/ignite/metrics/nlp/utils.py b/ignite/metrics/nlp/utils.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/ignite/metrics/nlp/__init__.py b/tests/ignite/metrics/nlp/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/ignite/metrics/nlp/test_bleu.py b/tests/ignite/metrics/nlp/test_bleu.py new file mode 100644 index 00000000000..21aee7fad3e --- /dev/null +++ b/tests/ignite/metrics/nlp/test_bleu.py @@ -0,0 +1,239 @@ +import os +import pytest +import warnings + +from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction + +import torch + +import ignite.distributed as idist +from ignite.exceptions import NotComputableError +from ignite.metrics.nlp import Bleu + +from . 
import CorpusForTest + +corpus = CorpusForTest(lower_split=True) + + +def test_wrong_inputs(): + + with pytest.raises(ValueError, match=r"ngram order must be greater than zero"): + Bleu(ngram=0) + + with pytest.raises(ValueError, match=r"Smooth is not valid"): + Bleu(smooth="fake") + + with pytest.raises(NotComputableError): + Bleu().compute() + + +@pytest.mark.parametrize( + "candidate, references", + [ + ([["a", "a", "a", "b", "c"]], [[["a", "b", "c"], ["a", "a", "d"]]]), + corpus.sample_1, + corpus.sample_2, + corpus.sample_3, + corpus.sample_4, + ], +) +def test_corpus_bleu(candidate, references): + print(candidate, references) + for i in range(1, 8): + weights = tuple([1 / i] * i) + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + reference = corpus_bleu(references, candidate, weights=weights) + bleu = Bleu(ngram=i) + assert pytest.approx(reference) == bleu.corpus_bleu(references, candidate) + bleu.update((candidate[0], references[0])) + assert pytest.approx(reference) == bleu.compute() + + +@pytest.mark.parametrize( + "candidate, references", + [ + ([["a", "a", "a", "b", "c"]], [[["a", "b", "c"], ["a", "a", "d"]]]), + corpus.sample_1, + corpus.sample_2, + corpus.sample_3, + corpus.sample_4, + ], +) +def test_corpus_bleu_smooth1(candidate, references): + for i in range(1, 8): + weights = tuple([1 / i] * i) + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + reference = corpus_bleu(references, candidate, + weights=weights, + smoothing_function=SmoothingFunction().method1) + bleu = Bleu(ngram=i, smooth="smooth1") + assert reference == bleu.corpus_bleu(references, candidate) + bleu.update((candidate[0], references[0])) + assert reference == bleu.compute() + + +@pytest.mark.parametrize( + "candidate, references", + [ + ([["a", "a", "a", "b", "c"]], [[["a", "b", "c"], ["a", "a", "d"]]]), + corpus.sample_1, + corpus.sample_2, + corpus.sample_3, + corpus.sample_4, + ], +) +def test_corpus_bleu_nltk_smooth2(candidate, references): + for i in range(1, 8): + weights = tuple([1 / i] * i) + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + reference = corpus_bleu(references, candidate, + weights=weights, + smoothing_function=SmoothingFunction().method2) + bleu = Bleu(ngram=i, smooth="nltk_smooth2") + assert reference == bleu.corpus_bleu(references, candidate) + bleu.update((candidate[0], references[0])) + assert reference == bleu.compute() + + +@pytest.mark.parametrize( + "candidate, references", + [ + ([["a", "a", "a", "b", "c"]], [[["a", "b", "c"], ["a", "a", "d"]]]), + corpus.sample_1, + corpus.sample_2, + corpus.sample_3, + corpus.sample_4, + ], +) +def test_corpus_bleu_smooth2(candidate, references): + for i in range(1, 3): + weights = tuple([1 / i] * i) + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + reference = corpus_bleu(references, candidate, + weights=weights, + smoothing_function=SmoothingFunction().method2) + bleu = Bleu(ngram=i, smooth="smooth2") + assert reference == bleu.corpus_bleu(references, candidate) + bleu.update((candidate[0], references[0])) + assert reference == bleu.compute() + + +def test_bleu(): + bleu = Bleu(ngram=4, smooth="smooth2") + bleu.update((corpus.cand_1, corpus.references_1)) + bleu.update((corpus.cand_2a, corpus.references_2)) + bleu.update((corpus.cand_2b, corpus.references_2)) + bleu.update((corpus.cand_3, corpus.references_2)) + value = bleu.corpus_bleu([corpus.references_1], [corpus.cand_1]) + value += bleu.corpus_bleu([corpus.references_2], [corpus.cand_2a]) + value += 
bleu.corpus_bleu([corpus.references_2], [corpus.cand_2b]) + value += bleu.corpus_bleu([corpus.references_2], [corpus.cand_3]) + assert bleu.compute() == value / 4 + + +def _test_distrib_integration(device): + + from ignite.engine import Engine + + rank = idist.get_rank() + + size = len(corpus.chunks) + + data = [] + for c in corpus.chunks: + data += idist.get_world_size() * [c] + + def update(_, i): + return data[i + size * rank] + + def _test(metric_device): + engine = Engine(update) + m = Bleu(ngram=4, smooth="smooth2") + m.attach(engine, "bleu") + + engine.run(data=list(range(size)), max_epochs=1) + + assert "bleu" in engine.state.metrics + + ref_bleu = 0 + for candidate, references in data: + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + ref_bleu += corpus_bleu([references], [candidate], + weights=[0.25, 0.25, 0.25, 0.25], + smoothing_function=SmoothingFunction().method2) + + assert pytest.approx(engine.state.metrics["bleu"]) == ref_bleu / len(data) + + _test("cpu") + + if device.type != "xla": + _test(idist.device()) + + +@pytest.mark.distributed +@pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") +@pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") +def test_distrib_gpu(local_rank, distributed_context_single_node_nccl): + device = torch.device(f"cuda:{local_rank}") + _test_distrib_integration(device) + + +@pytest.mark.distributed +@pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") +def test_distrib_cpu(distributed_context_single_node_gloo): + device = torch.device("cpu") + _test_distrib_integration(device) + + +@pytest.mark.distributed +@pytest.mark.skipif(not idist.has_hvd_support, reason="Skip if no Horovod dist support") +@pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Skip if launched as multiproc") +def test_distrib_hvd(gloo_hvd_executor): + + device = torch.device("cpu" if not torch.cuda.is_available() else "cuda") + nproc = 4 if not torch.cuda.is_available() else torch.cuda.device_count() + + gloo_hvd_executor(_test_distrib_integration, (device,), np=nproc, do_init=True) + + +@pytest.mark.multinode_distributed +@pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") +@pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") +def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): + device = torch.device("cpu") + _test_distrib_integration(device) + + +@pytest.mark.multinode_distributed +@pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") +@pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") +def test_multinode_distrib_gpu(distributed_context_multi_node_nccl): + device = torch.device(f"cuda:{distributed_context_multi_node_nccl['local_rank']}") + _test_distrib_integration(device) + + +@pytest.mark.tpu +@pytest.mark.skipif("NUM_TPU_WORKERS" in os.environ, reason="Skip if NUM_TPU_WORKERS is in env vars") +@pytest.mark.skipif(not idist.has_xla_support, reason="Skip if no PyTorch XLA package") +def test_distrib_single_device_xla(): + device = idist.device() + _test_distrib_integration(device) + + +def _test_distrib_xla_nprocs(index): + device = idist.device() + _test_distrib_integration(device) + + +@pytest.mark.tpu +@pytest.mark.skipif("NUM_TPU_WORKERS" not in os.environ, reason="Skip if no NUM_TPU_WORKERS in env vars") +@pytest.mark.skipif(not 
idist.has_xla_support, reason="Skip if no PyTorch XLA package") +def test_distrib_xla_nprocs(xmp_executor): + n = int(os.environ["NUM_TPU_WORKERS"]) + xmp_executor(_test_distrib_xla_nprocs, args=(), nprocs=n) + diff --git a/tests/ignite/metrics/test_rouge.py b/tests/ignite/metrics/nlp/test_rouge.py similarity index 99% rename from tests/ignite/metrics/test_rouge.py rename to tests/ignite/metrics/nlp/test_rouge.py index b8c5bc6e6f3..b83c97afaf1 100644 --- a/tests/ignite/metrics/test_rouge.py +++ b/tests/ignite/metrics/nlp/test_rouge.py @@ -8,7 +8,7 @@ import ignite.distributed as idist from ignite.exceptions import NotComputableError from ignite.metrics import Rouge -from ignite.metrics.rouge import RougeL, RougeN, compute_ngram_scores, lcs, ngrams +from ignite.metrics.nlp.rouge import RougeL, RougeN, compute_ngram_scores, lcs, ngrams nltk.download("punkt") diff --git a/tests/ignite/metrics/nlp/test_utils.py b/tests/ignite/metrics/nlp/test_utils.py new file mode 100644 index 00000000000..e69de29bb2d From fd0df398e588b202ef128fd8592348ddfc8f91e0 Mon Sep 17 00:00:00 2001 From: Sylvain Desroziers Date: Sun, 21 Mar 2021 20:06:24 +0100 Subject: [PATCH 02/13] Remove blank line --- tests/ignite/metrics/nlp/test_bleu.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/ignite/metrics/nlp/test_bleu.py b/tests/ignite/metrics/nlp/test_bleu.py index 21aee7fad3e..36f998b419f 100644 --- a/tests/ignite/metrics/nlp/test_bleu.py +++ b/tests/ignite/metrics/nlp/test_bleu.py @@ -236,4 +236,3 @@ def _test_distrib_xla_nprocs(index): def test_distrib_xla_nprocs(xmp_executor): n = int(os.environ["NUM_TPU_WORKERS"]) xmp_executor(_test_distrib_xla_nprocs, args=(), nprocs=n) - From 30c667ad161e6e7fd73d5a1022a75df4bdbe43e3 Mon Sep 17 00:00:00 2001 From: sdesrozis Date: Sun, 21 Mar 2021 19:07:14 +0000 Subject: [PATCH 03/13] autopep8 fix --- ignite/metrics/nlp/bleu.py | 8 +++---- tests/ignite/metrics/nlp/test_bleu.py | 32 ++++++++++++++------------- 2 files changed, 20 insertions(+), 20 deletions(-) diff --git a/ignite/metrics/nlp/bleu.py b/ignite/metrics/nlp/bleu.py index d36e68bda65..b0706886778 100644 --- a/ignite/metrics/nlp/bleu.py +++ b/ignite/metrics/nlp/bleu.py @@ -1,12 +1,11 @@ import math from collections import Counter -from typing import Callable, Sequence, Optional, Tuple, Union, Any +from typing import Any, Callable, Optional, Sequence, Tuple, Union import torch from ignite.exceptions import NotComputableError from ignite.metrics.metric import Metric, reinit__is_reduced, sync_all_reduce - from ignite.metrics.nlp.utils import modified_precision __all__ = ["Bleu"] @@ -23,6 +22,7 @@ class _Smoother: Smoothing helper http://acl2014.org/acl2014/W14-33/pdf/W14-3346.pdf """ + def __init__(self, method: str): valid = ["no_smooth", "smooth1", "nltk_smooth2", "smooth2"] if method not in valid: @@ -121,9 +121,7 @@ def corpus_bleu( p_numerators = Counter() # Key = ngram order, and value = no. of ngram matches. p_denominators = Counter() # Key = ngram order, and value = no. of ngram in ref. - assert len(references) == len(candidates), ( - "The number of hypotheses and their reference(s) should be the same " - ) + assert len(references) == len(candidates), "The number of hypotheses and their reference(s) should be the same " # Iterate through each hypothesis and their corresponding references. 
for refs, hyp in zip(references, candidates): diff --git a/tests/ignite/metrics/nlp/test_bleu.py b/tests/ignite/metrics/nlp/test_bleu.py index 36f998b419f..57171d60bbe 100644 --- a/tests/ignite/metrics/nlp/test_bleu.py +++ b/tests/ignite/metrics/nlp/test_bleu.py @@ -1,10 +1,9 @@ import os -import pytest import warnings -from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction - +import pytest import torch +from nltk.translate.bleu_score import SmoothingFunction, corpus_bleu import ignite.distributed as idist from ignite.exceptions import NotComputableError @@ -65,9 +64,9 @@ def test_corpus_bleu_smooth1(candidate, references): weights = tuple([1 / i] * i) with warnings.catch_warnings(): warnings.simplefilter("ignore") - reference = corpus_bleu(references, candidate, - weights=weights, - smoothing_function=SmoothingFunction().method1) + reference = corpus_bleu( + references, candidate, weights=weights, smoothing_function=SmoothingFunction().method1 + ) bleu = Bleu(ngram=i, smooth="smooth1") assert reference == bleu.corpus_bleu(references, candidate) bleu.update((candidate[0], references[0])) @@ -89,9 +88,9 @@ def test_corpus_bleu_nltk_smooth2(candidate, references): weights = tuple([1 / i] * i) with warnings.catch_warnings(): warnings.simplefilter("ignore") - reference = corpus_bleu(references, candidate, - weights=weights, - smoothing_function=SmoothingFunction().method2) + reference = corpus_bleu( + references, candidate, weights=weights, smoothing_function=SmoothingFunction().method2 + ) bleu = Bleu(ngram=i, smooth="nltk_smooth2") assert reference == bleu.corpus_bleu(references, candidate) bleu.update((candidate[0], references[0])) @@ -113,9 +112,9 @@ def test_corpus_bleu_smooth2(candidate, references): weights = tuple([1 / i] * i) with warnings.catch_warnings(): warnings.simplefilter("ignore") - reference = corpus_bleu(references, candidate, - weights=weights, - smoothing_function=SmoothingFunction().method2) + reference = corpus_bleu( + references, candidate, weights=weights, smoothing_function=SmoothingFunction().method2 + ) bleu = Bleu(ngram=i, smooth="smooth2") assert reference == bleu.corpus_bleu(references, candidate) bleu.update((candidate[0], references[0])) @@ -163,9 +162,12 @@ def _test(metric_device): for candidate, references in data: with warnings.catch_warnings(): warnings.simplefilter("ignore") - ref_bleu += corpus_bleu([references], [candidate], - weights=[0.25, 0.25, 0.25, 0.25], - smoothing_function=SmoothingFunction().method2) + ref_bleu += corpus_bleu( + [references], + [candidate], + weights=[0.25, 0.25, 0.25, 0.25], + smoothing_function=SmoothingFunction().method2, + ) assert pytest.approx(engine.state.metrics["bleu"]) == ref_bleu / len(data) From c9a0eef9d3a6ebe5f3bd698f9b9e7bdc4940037e Mon Sep 17 00:00:00 2001 From: Sylvain Desroziers Date: Sun, 21 Mar 2021 20:22:18 +0100 Subject: [PATCH 04/13] Remove metrics --- ignite/metrics/__init__.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/ignite/metrics/__init__.py b/ignite/metrics/__init__.py index 47765f677eb..61003635964 100644 --- a/ignite/metrics/__init__.py +++ b/ignite/metrics/__init__.py @@ -15,7 +15,6 @@ from ignite.metrics.psnr import PSNR from ignite.metrics.recall import Recall from ignite.metrics.root_mean_squared_error import RootMeanSquaredError -from ignite.metrics.rouge import Rouge, RougeL, RougeN from ignite.metrics.running_average import RunningAverage from ignite.metrics.ssim import SSIM from ignite.metrics.top_k_categorical_accuracy import TopKCategoricalAccuracy @@ -43,9 +42,6 
@@ "PSNR", "Recall", "RootMeanSquaredError", - "Rouge", - "RougeN", - "RougeL", "RunningAverage", "VariableAccumulation", "Frequency", From 0cb54829936fd10fa0f49fbffb1f29beda396904 Mon Sep 17 00:00:00 2001 From: Sylvain Desroziers Date: Sun, 21 Mar 2021 20:26:45 +0100 Subject: [PATCH 05/13] Add nlp metrics in nlp.__init__ --- ignite/metrics/nlp/__init__.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/ignite/metrics/nlp/__init__.py b/ignite/metrics/nlp/__init__.py index e69de29bb2d..cc004b592f9 100644 --- a/ignite/metrics/nlp/__init__.py +++ b/ignite/metrics/nlp/__init__.py @@ -0,0 +1,9 @@ +from ignite.metrics.nlp.bleu import Bleu +from ignite.metrics.nlp.rouge import RougeN, RougeL, Rouge + +__all__ = [ + "Bleu", + "Rouge", + "RougeL", + "RougeN", +] From 59db6c0a2025c69ed09461feebe2449d4be1f01b Mon Sep 17 00:00:00 2001 From: sdesrozis Date: Sun, 21 Mar 2021 19:27:29 +0000 Subject: [PATCH 06/13] autopep8 fix --- ignite/metrics/nlp/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ignite/metrics/nlp/__init__.py b/ignite/metrics/nlp/__init__.py index cc004b592f9..20120ea7c0a 100644 --- a/ignite/metrics/nlp/__init__.py +++ b/ignite/metrics/nlp/__init__.py @@ -1,5 +1,5 @@ from ignite.metrics.nlp.bleu import Bleu -from ignite.metrics.nlp.rouge import RougeN, RougeL, Rouge +from ignite.metrics.nlp.rouge import Rouge, RougeL, RougeN __all__ = [ "Bleu", From 7aaa78910a50d57c9c449f0ea04e52d67e4ac3cc Mon Sep 17 00:00:00 2001 From: Desroziers Date: Mon, 22 Mar 2021 08:39:00 +0100 Subject: [PATCH 07/13] update --- ignite/metrics/__init__.py | 4 -- ignite/metrics/nlp/__init__.py | 9 +++ ignite/metrics/nlp/bleu.py | 32 ++++----- ignite/metrics/nlp/rouge.py | 52 ++------------- ignite/metrics/nlp/utils.py | 89 ++++++++++++++++++++++++++ tests/ignite/metrics/nlp/__init__.py | 61 ++++++++++++++++++ tests/ignite/metrics/nlp/test_bleu.py | 33 +++++----- tests/ignite/metrics/nlp/test_rouge.py | 81 +++-------------------- tests/ignite/metrics/nlp/test_utils.py | 57 +++++++++++++++++ 9 files changed, 260 insertions(+), 158 deletions(-) diff --git a/ignite/metrics/__init__.py b/ignite/metrics/__init__.py index 47765f677eb..61003635964 100644 --- a/ignite/metrics/__init__.py +++ b/ignite/metrics/__init__.py @@ -15,7 +15,6 @@ from ignite.metrics.psnr import PSNR from ignite.metrics.recall import Recall from ignite.metrics.root_mean_squared_error import RootMeanSquaredError -from ignite.metrics.rouge import Rouge, RougeL, RougeN from ignite.metrics.running_average import RunningAverage from ignite.metrics.ssim import SSIM from ignite.metrics.top_k_categorical_accuracy import TopKCategoricalAccuracy @@ -43,9 +42,6 @@ "PSNR", "Recall", "RootMeanSquaredError", - "Rouge", - "RougeN", - "RougeL", "RunningAverage", "VariableAccumulation", "Frequency", diff --git a/ignite/metrics/nlp/__init__.py b/ignite/metrics/nlp/__init__.py index e69de29bb2d..506f0bab51e 100644 --- a/ignite/metrics/nlp/__init__.py +++ b/ignite/metrics/nlp/__init__.py @@ -0,0 +1,9 @@ +from ignite.metrics.nlp.bleu import Bleu +from ignite.metrics.nlp.rouge import Rouge, RougeL, RougeN + +__all__ = [ + "Bleu", + "Rouge", + "RougeN", + "RougeL", +] diff --git a/ignite/metrics/nlp/bleu.py b/ignite/metrics/nlp/bleu.py index d36e68bda65..ff7f61607dd 100644 --- a/ignite/metrics/nlp/bleu.py +++ b/ignite/metrics/nlp/bleu.py @@ -1,12 +1,11 @@ import math from collections import Counter -from typing import Callable, Sequence, Optional, Tuple, Union, Any +from typing import Any, Callable, Optional, Sequence, Tuple, Union 
import torch from ignite.exceptions import NotComputableError from ignite.metrics.metric import Metric, reinit__is_reduced, sync_all_reduce - from ignite.metrics.nlp.utils import modified_precision __all__ = ["Bleu"] @@ -23,6 +22,7 @@ class _Smoother: Smoothing helper http://acl2014.org/acl2014/W14-33/pdf/W14-3346.pdf """ + def __init__(self, method: str): valid = ["no_smooth", "smooth1", "nltk_smooth2", "smooth2"] if method not in valid: @@ -36,13 +36,13 @@ def __call__(self, numerators: Counter, denominators: Counter) -> Sequence[float @staticmethod def smooth1(numerators: Counter, denominators: Counter) -> Sequence[float]: epsilon = 0.1 - denominators = [max(1, d) for d in denominators.values()] - return [n / d if n != 0 else epsilon / d for n, d in zip(numerators.values(), denominators)] + denominators_ = [max(1, d) for d in denominators.values()] + return [n / d if n != 0 else epsilon / d for n, d in zip(numerators.values(), denominators_)] @staticmethod def nltk_smooth2(numerators: Counter, denominators: Counter) -> Sequence[float]: - denominators = [max(1, d) for d in denominators.values()] - return [(n + 1) / (d + 1) for n, d in zip(numerators.values(), denominators)] + denominators_ = [max(1, d) for d in denominators.values()] + return [(n + 1) / (d + 1) for n, d in zip(numerators.values(), denominators_)] @staticmethod def smooth2(numerators: Counter, denominators: Counter) -> Sequence[float]: @@ -50,8 +50,8 @@ def smooth2(numerators: Counter, denominators: Counter) -> Sequence[float]: @staticmethod def no_smooth(numerators: Counter, denominators: Counter) -> Sequence[float]: - denominators = [max(1, d) for d in denominators.values()] - return [n / d for n, d in zip(numerators.values(), denominators)] + denominators_ = [max(1, d) for d in denominators.values()] + return [n / d for n, d in zip(numerators.values(), denominators_)] class Bleu(Metric): @@ -115,15 +115,11 @@ def __init__( self.smoother = _Smoother(method=smooth) super(Bleu, self).__init__(output_transform=output_transform, device=device) - def corpus_bleu( - self, references: Sequence[Sequence[Any]], candidates: Sequence[Sequence[Any]], - ): - p_numerators = Counter() # Key = ngram order, and value = no. of ngram matches. - p_denominators = Counter() # Key = ngram order, and value = no. of ngram in ref. + def corpus_bleu(self, references: Sequence[Sequence[Any]], candidates: Sequence[Sequence[Any]],) -> float: + p_numerators: Counter = Counter() # Key = ngram order, and value = no. of ngram matches. + p_denominators: Counter = Counter() # Key = ngram order, and value = no. of ngram in ref. - assert len(references) == len(candidates), ( - "The number of hypotheses and their reference(s) should be the same " - ) + assert len(references) == len(candidates), "The number of hypotheses and their reference(s) should be the same " # Iterate through each hypothesis and their corresponding references. 
for refs, hyp in zip(references, candidates): @@ -165,8 +161,8 @@ def corpus_bleu( # Compute the geometric mean s = [w_i * math.log(p_i) for w_i, p_i in zip(self.weights, p_n)] - s = bp * math.exp(math.fsum(s)) - return s + gm = bp * math.exp(math.fsum(s)) + return gm @reinit__is_reduced def reset(self) -> None: diff --git a/ignite/metrics/nlp/rouge.py b/ignite/metrics/nlp/rouge.py index 8c012a60596..18f264fe09d 100644 --- a/ignite/metrics/nlp/rouge.py +++ b/ignite/metrics/nlp/rouge.py @@ -1,5 +1,5 @@ from abc import ABCMeta, abstractmethod -from collections import Counter, namedtuple +from collections import namedtuple from typing import Any, Callable, List, Mapping, Optional, Sequence, Tuple, Union import torch @@ -9,53 +9,9 @@ # These decorators helps with distributed settings from ignite.metrics.metric import reinit__is_reduced, sync_all_reduce +from ignite.metrics.nlp.utils import lcs, ngrams - -def ngrams(sequence: Sequence[Any], n: int) -> Counter: - """ - Generate the ngrams from a sequence of items - - Args: - sequence: sequence of items - n: ngram order - - Returns: - A counter of ngram objects - - .. versionadded:: 0.5.0 - """ - return Counter([tuple(sequence[i : i + n]) for i in range(len(sequence) - n + 1)]) - - -def lcs(seq_a: Sequence[Any], seq_b: Sequence[Any]) -> int: - """ - Compute the length of the longest common subsequence in two sequence of items - https://en.wikipedia.org/wiki/Longest_common_subsequence_problem - - Args: - seq_a: first sequence of items - seq_b: second sequence of items - - Returns: - The length of the longest common subsequence - - .. versionadded:: 0.5.0 - """ - m = len(seq_a) - n = len(seq_b) - - dp = [[0] * (n + 1) for _ in range(m + 1)] - - for i in range(m + 1): - for j in range(n + 1): - if i == 0 or j == 0: - dp[i][j] = 0 - elif seq_a[i - 1] == seq_b[j - 1]: - dp[i][j] = dp[i - 1][j - 1] + 1 - else: - dp[i][j] = max(dp[i - 1][j], dp[i][j - 1]) - - return dp[m][n] +__all__ = ["Rouge", "RougeN", "RougeL"] class Score(namedtuple("Score", ["match", "candidate", "reference"])): @@ -289,7 +245,7 @@ def __init__( super(RougeN, self).__init__(multiref=multiref, alpha=alpha, output_transform=output_transform, device=device) self._ngram = ngram if self._ngram < 1: - raise ValueError(f"ngram order must be greater than one (got : {self._ngram})") + raise ValueError(f"ngram order must be greater than zero (got : {self._ngram})") def _compute_score(self, candidate: Sequence[Any], reference: Sequence[Any]) -> Score: return compute_ngram_scores(candidate=candidate, reference=reference, n=self._ngram) diff --git a/ignite/metrics/nlp/utils.py b/ignite/metrics/nlp/utils.py index e69de29bb2d..90be7fbff1e 100644 --- a/ignite/metrics/nlp/utils.py +++ b/ignite/metrics/nlp/utils.py @@ -0,0 +1,89 @@ +from collections import Counter +from typing import Any, Sequence, Tuple + +__all__ = ["ngrams", "lcs", "modified_precision"] + + +def ngrams(sequence: Sequence[Any], n: int) -> Counter: + """ + Generate the ngrams from a sequence of items + + Args: + sequence: sequence of items + n: n-gram order + + Returns: + A counter of ngram objects + + .. 
versionadded:: 0.5.0 + """ + return Counter([tuple(sequence[i : i + n]) for i in range(len(sequence) - n + 1)]) + + +def lcs(seq_a: Sequence[Any], seq_b: Sequence[Any]) -> int: + """ + Compute the length of the longest common subsequence in two sequence of items + https://en.wikipedia.org/wiki/Longest_common_subsequence_problem + + Args: + seq_a: first sequence of items + seq_b: second sequence of items + + Returns: + The length of the longest common subsequence + + .. versionadded:: 0.5.0 + """ + m = len(seq_a) + n = len(seq_b) + + dp = [[0] * (n + 1) for _ in range(m + 1)] + + for i in range(m + 1): + for j in range(n + 1): + if i == 0 or j == 0: + dp[i][j] = 0 + elif seq_a[i - 1] == seq_b[j - 1]: + dp[i][j] = dp[i - 1][j - 1] + 1 + else: + dp[i][j] = max(dp[i - 1][j], dp[i][j - 1]) + + return dp[m][n] + + +def modified_precision(references: Sequence[Sequence[Any]], candidate: Any, n: int) -> Tuple[int, int]: + """ + Compute the modified precision + + .. math:: + p_{n} = \frac{m_{n}}{l_{n}} + + where m_{n} is the number of matched n-grams between translation T and its reference R, and l_{n} is the + total number of n-grams in the translation T. + + More details can be found in `Papineni et al. 2002`__. + + __ https://www.aclweb.org/anthology/P02-1040.pdf + + Args: + references: list of references R + candidate: translation T + n: n-gram order + + Returns: + The length of the longest common subsequence + + .. versionadded:: 0.5.0 + """ + # ngrams of the candidate + counts = ngrams(candidate, n) + + # union of ngrams of references + max_counts: Counter = Counter() + for reference in references: + max_counts |= ngrams(reference, n) + + # clipped count of the candidate and references + clipped_counts = counts & max_counts + + return sum(clipped_counts.values()), sum(counts.values()) diff --git a/tests/ignite/metrics/nlp/__init__.py b/tests/ignite/metrics/nlp/__init__.py index e69de29bb2d..e12bafd8508 100644 --- a/tests/ignite/metrics/nlp/__init__.py +++ b/tests/ignite/metrics/nlp/__init__.py @@ -0,0 +1,61 @@ +__all__ = ["CorpusForTest"] + + +class CorpusForTest: + def __init__(self, lower_split=False): + def preproc(text): + if lower_split: + return text.lower().split() + else: + return text + + # BLEU Paper examples + self.cand_1 = preproc("the the the the the the the") + self.ref_1a = preproc("The cat is on the mat") + self.ref_1b = preproc("There is a cat on the mat") + + self.cand_2a = preproc( + "It is a guide to action which ensures that the military always obeys the commands of the party" + ) + self.cand_2b = preproc("It is to insure the troops forever hearing the activity guidebook that " "party direct") + self.ref_2a = preproc( + "It is a guide to action that ensures that the military will forever heed " "Party commands" + ) + self.ref_2b = preproc( + "It is the guiding principle which guarantees the military forces always being under the command of " + "the Party" + ) + self.ref_2c = preproc("It is the practical guide for the army always to heed the directions of the party") + + self.cand_3 = preproc("of the") + + self.references_1 = [self.ref_1a, self.ref_1b] + self.references_2 = [self.ref_2a, self.ref_2b, self.ref_2c] + + self.sample_1 = ([self.cand_1], [self.references_1]) + self.sample_2 = ([self.cand_3], [self.references_2]) + self.sample_3 = ([self.cand_2a], [self.references_2]) + self.sample_4 = ([self.cand_2b], [self.references_2]) + self.sample_5 = ([self.cand_2a, self.cand_2b], [self.references_2, self.references_2]) + + self.references_3 = [self.ref_2a, self.ref_2b] + 
self.references_4 = [self.ref_2b, self.ref_2c] + self.references_5 = [self.ref_2a, self.ref_2c] + + self.chunks = [ + (self.cand_1, self.references_1), + (self.cand_2a, self.references_2), + (self.cand_2b, self.references_2), + (self.cand_1, [self.ref_1a]), + (self.cand_2a, self.references_3), + (self.cand_2b, self.references_3), + (self.cand_1, [self.ref_1b]), + (self.cand_2a, self.references_4), + (self.cand_2b, self.references_4), + (self.cand_1, self.references_1), + (self.cand_2a, self.references_5), + (self.cand_2b, self.references_5), + (self.cand_1, [self.ref_1a]), + (self.cand_2a, [self.ref_2a]), + (self.cand_2b, [self.ref_2c]), + ] diff --git a/tests/ignite/metrics/nlp/test_bleu.py b/tests/ignite/metrics/nlp/test_bleu.py index 21aee7fad3e..57171d60bbe 100644 --- a/tests/ignite/metrics/nlp/test_bleu.py +++ b/tests/ignite/metrics/nlp/test_bleu.py @@ -1,10 +1,9 @@ import os -import pytest import warnings -from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction - +import pytest import torch +from nltk.translate.bleu_score import SmoothingFunction, corpus_bleu import ignite.distributed as idist from ignite.exceptions import NotComputableError @@ -65,9 +64,9 @@ def test_corpus_bleu_smooth1(candidate, references): weights = tuple([1 / i] * i) with warnings.catch_warnings(): warnings.simplefilter("ignore") - reference = corpus_bleu(references, candidate, - weights=weights, - smoothing_function=SmoothingFunction().method1) + reference = corpus_bleu( + references, candidate, weights=weights, smoothing_function=SmoothingFunction().method1 + ) bleu = Bleu(ngram=i, smooth="smooth1") assert reference == bleu.corpus_bleu(references, candidate) bleu.update((candidate[0], references[0])) @@ -89,9 +88,9 @@ def test_corpus_bleu_nltk_smooth2(candidate, references): weights = tuple([1 / i] * i) with warnings.catch_warnings(): warnings.simplefilter("ignore") - reference = corpus_bleu(references, candidate, - weights=weights, - smoothing_function=SmoothingFunction().method2) + reference = corpus_bleu( + references, candidate, weights=weights, smoothing_function=SmoothingFunction().method2 + ) bleu = Bleu(ngram=i, smooth="nltk_smooth2") assert reference == bleu.corpus_bleu(references, candidate) bleu.update((candidate[0], references[0])) @@ -113,9 +112,9 @@ def test_corpus_bleu_smooth2(candidate, references): weights = tuple([1 / i] * i) with warnings.catch_warnings(): warnings.simplefilter("ignore") - reference = corpus_bleu(references, candidate, - weights=weights, - smoothing_function=SmoothingFunction().method2) + reference = corpus_bleu( + references, candidate, weights=weights, smoothing_function=SmoothingFunction().method2 + ) bleu = Bleu(ngram=i, smooth="smooth2") assert reference == bleu.corpus_bleu(references, candidate) bleu.update((candidate[0], references[0])) @@ -163,9 +162,12 @@ def _test(metric_device): for candidate, references in data: with warnings.catch_warnings(): warnings.simplefilter("ignore") - ref_bleu += corpus_bleu([references], [candidate], - weights=[0.25, 0.25, 0.25, 0.25], - smoothing_function=SmoothingFunction().method2) + ref_bleu += corpus_bleu( + [references], + [candidate], + weights=[0.25, 0.25, 0.25, 0.25], + smoothing_function=SmoothingFunction().method2, + ) assert pytest.approx(engine.state.metrics["bleu"]) == ref_bleu / len(data) @@ -236,4 +238,3 @@ def _test_distrib_xla_nprocs(index): def test_distrib_xla_nprocs(xmp_executor): n = int(os.environ["NUM_TPU_WORKERS"]) xmp_executor(_test_distrib_xla_nprocs, args=(), nprocs=n) - diff --git 
a/tests/ignite/metrics/nlp/test_rouge.py b/tests/ignite/metrics/nlp/test_rouge.py index b83c97afaf1..40aafae189c 100644 --- a/tests/ignite/metrics/nlp/test_rouge.py +++ b/tests/ignite/metrics/nlp/test_rouge.py @@ -7,36 +7,14 @@ import ignite.distributed as idist from ignite.exceptions import NotComputableError -from ignite.metrics import Rouge -from ignite.metrics.nlp.rouge import RougeL, RougeN, compute_ngram_scores, lcs, ngrams +from ignite.metrics.nlp import Rouge +from ignite.metrics.nlp.rouge import RougeL, RougeN, compute_ngram_scores + +from . import CorpusForTest nltk.download("punkt") - -@pytest.mark.parametrize( - "sequence, n, expected_keys, expected_values", - [ - ([], 1, [], []), - ([0, 1, 2], 1, [(0,), (1,), (2,)], [1, 1, 1]), - ([0, 1, 2], 2, [(0, 1,), (1, 2,),], [1, 1],), - ([0, 1, 2], 3, [(0, 1, 2)], [1]), - ([0, 0, 0], 1, [(0,)], [3]), - ([0, 0, 0], 2, [(0, 0)], [2]), - ("abcde", 4, [("a", "b", "c", "d"), ("b", "c", "d", "e")], [1, 1]), - ], -) -def test_ngrams(sequence, n, expected_keys, expected_values): - ngrams_counter = ngrams(sequence=sequence, n=n) - assert list(ngrams_counter.values()) == expected_values - assert list(ngrams_counter.keys()) == expected_keys - - -@pytest.mark.parametrize( - "seq_a, seq_b, expected", - [([], [], 0), ([0, 1, 2], [0, 1, 2], 3), ([0, 1, 2], [0, 3, 2], 2), ("academy", "abracadabra", 4),], -) -def test_lcs(seq_a, seq_b, expected): - assert lcs(seq_a, seq_b) == expected +corpus = CorpusForTest() @pytest.mark.parametrize( @@ -61,7 +39,7 @@ def test_compute_ngram_scores(candidate, reference, n, expected_precision, expec def test_wrong_inputs(): - with pytest.raises(ValueError, match=r"ngram order must be greater than one"): + with pytest.raises(ValueError, match=r"ngram order must be greater than zero"): RougeN(ngram=0) with pytest.raises(ValueError, match=r"alpha must be in interval \[0, 1\]"): @@ -106,31 +84,8 @@ def test_rouge_n_alpha(ngram, candidate, reference, expected): assert results[f"Rouge-{ngram}-F"] == F -# BLEU Paper examples -CAND_1 = "the the the the the the the" -REF_1A = "The cat is on the mat" -REF_1B = "There is a cat on the mat" - -CAND_2A = "It is a guide to action which ensures that the military always obeys the " "commands of the party" -CAND_2B = "It is to insure the troops forever hearing the activity guidebook that " "party direct" -REF_2A = "It is a guide to action that ensures that the military will forever heed " "Party commands" -REF_2B = ( - "It is the guiding principle which guarantees the military forces always being under the " "command of the Party" -) -REF_2C = "It is the practical guide for the army always to heed the directions of the " "party" - -CAND_3 = "of the" - - @pytest.mark.parametrize( - "candidates, references", - [ - ([CAND_1], [[REF_1A, REF_1B]]), - ([CAND_3], [[REF_2A, REF_2B, REF_2C]]), - ([CAND_2A], [[REF_2A, REF_2B, REF_2C]]), - ([CAND_2B], [[REF_2A, REF_2B, REF_2C]]), - ([CAND_2A, CAND_2B], [[REF_2A, REF_2B, REF_2C], [REF_2A, REF_2B, REF_2C]]), - ], + "candidates, references", [corpus.sample_1, corpus.sample_2, corpus.sample_3, corpus.sample_4, corpus.sample_5,], ) def test_rouge_metrics(candidates, references): for multiref in ["average", "best"]: @@ -171,28 +126,10 @@ def _test_distrib_integration(device): rank = idist.get_rank() - chunks = [ - (CAND_1, [REF_1A, REF_1B]), - (CAND_2A, [REF_2A, REF_2B, REF_2C]), - (CAND_2B, [REF_2A, REF_2B, REF_2C]), - (CAND_1, [REF_1A]), - (CAND_2A, [REF_2A, REF_2B]), - (CAND_2B, [REF_2A, REF_2B]), - (CAND_1, [REF_1B]), - (CAND_2A, [REF_2B, REF_2C]), - 
(CAND_2B, [REF_2B, REF_2C]), - (CAND_1, [REF_1A, REF_1B]), - (CAND_2A, [REF_2A, REF_2C]), - (CAND_2B, [REF_2A, REF_2C]), - (CAND_1, [REF_1A]), - (CAND_2A, [REF_2A]), - (CAND_2B, [REF_2C]), - ] - - size = len(chunks) + size = len(corpus.chunks) data = [] - for c in chunks: + for c in corpus.chunks: data += idist.get_world_size() * [c] def update(_, i): diff --git a/tests/ignite/metrics/nlp/test_utils.py b/tests/ignite/metrics/nlp/test_utils.py index e69de29bb2d..8cf267a68bd 100644 --- a/tests/ignite/metrics/nlp/test_utils.py +++ b/tests/ignite/metrics/nlp/test_utils.py @@ -0,0 +1,57 @@ +import pytest + +from ignite.metrics.nlp.utils import lcs, modified_precision, ngrams + + +@pytest.mark.parametrize( + "sequence, n, expected_keys, expected_values", + [ + ([], 1, [], []), + ([0, 1, 2], 1, [(0,), (1,), (2,)], [1, 1, 1]), + ([0, 1, 2], 2, [(0, 1,), (1, 2,),], [1, 1],), + ([0, 1, 2], 3, [(0, 1, 2)], [1]), + ([0, 0, 0], 1, [(0,)], [3]), + ([0, 0, 0], 2, [(0, 0)], [2]), + ("abcde", 4, [("a", "b", "c", "d"), ("b", "c", "d", "e")], [1, 1]), + ], +) +def test_ngrams(sequence, n, expected_keys, expected_values): + ngrams_counter = ngrams(sequence=sequence, n=n) + assert list(ngrams_counter.values()) == expected_values + assert list(ngrams_counter.keys()) == expected_keys + + +@pytest.mark.parametrize( + "seq_a, seq_b, expected", + [([], [], 0), ([0, 1, 2], [0, 1, 2], 3), ([0, 1, 2], [0, 3, 2], 2), ("academy", "abracadabra", 4),], +) +def test_lcs(seq_a, seq_b, expected): + assert lcs(seq_a, seq_b) == expected + + +def test_modified_precision_empty(): + for k in range(1, 5): + n, d = modified_precision([[]], [], k) + assert n == 0 and d == 0 + n, d = modified_precision([[]], [0], k) + assert n == 0 and d == (k == 1) + n, d = modified_precision([[0]], [], k) + assert n == 0 and d == 0 + n, d = modified_precision([[]], list(range(k)), k) + assert n == 0 and d == 1 + n, d = modified_precision([list(range(k))], [], k) + assert n == 0 and d == 0 + + +@pytest.mark.parametrize( + "references, candidate, expected", + [ + ([[0, 0, 0], [1, 2]], [1, 2, 3, 4], ((2, 4), (1, 3), (0, 2))), + ([[0, 1, 2], [0, 0, 3]], [0, 0, 0, 1, 2], ((4, 5), (3, 4), (1, 3))), + ([[0, 1, 2], [3, 0, 3]], [3, 0, 0, 1, 2], ((4, 5), (3, 4), (1, 3))), + ], +) +def test_modified_precision(references, candidate, expected): + for n, (e_n, e_d) in enumerate(expected, start=1): + n, d = modified_precision(references, candidate, n) + assert n == e_n and d == e_d From 2a3a81b89e6fd37a946dd240d99085becf7b3920 Mon Sep 17 00:00:00 2001 From: Desroziers Date: Mon, 22 Mar 2021 08:51:17 +0100 Subject: [PATCH 08/13] expose nlp - replace assert by exception --- ignite/metrics/__init__.py | 6 ++++++ ignite/metrics/nlp/bleu.py | 12 ++++++++---- tests/ignite/metrics/nlp/test_bleu.py | 3 +++ 3 files changed, 17 insertions(+), 4 deletions(-) diff --git a/ignite/metrics/__init__.py b/ignite/metrics/__init__.py index 61003635964..d743faf4f62 100644 --- a/ignite/metrics/__init__.py +++ b/ignite/metrics/__init__.py @@ -11,6 +11,8 @@ from ignite.metrics.metric import BatchFiltered, BatchWise, EpochWise, Metric, MetricUsage from ignite.metrics.metrics_lambda import MetricsLambda from ignite.metrics.multilabel_confusion_matrix import MultiLabelConfusionMatrix +from ignite.metrics.nlp.bleu import Bleu +from ignite.metrics.nlp.rouge import Rouge, RougeL, RougeN from ignite.metrics.precision import Precision from ignite.metrics.psnr import PSNR from ignite.metrics.recall import Recall @@ -46,4 +48,8 @@ "VariableAccumulation", "Frequency", "SSIM", + "Bleu", + "Rouge", 
+ "RougeN", + "RougeL", ] diff --git a/ignite/metrics/nlp/bleu.py b/ignite/metrics/nlp/bleu.py index ff7f61607dd..fad3bed41e4 100644 --- a/ignite/metrics/nlp/bleu.py +++ b/ignite/metrics/nlp/bleu.py @@ -116,10 +116,14 @@ def __init__( super(Bleu, self).__init__(output_transform=output_transform, device=device) def corpus_bleu(self, references: Sequence[Sequence[Any]], candidates: Sequence[Sequence[Any]],) -> float: - p_numerators: Counter = Counter() # Key = ngram order, and value = no. of ngram matches. - p_denominators: Counter = Counter() # Key = ngram order, and value = no. of ngram in ref. - - assert len(references) == len(candidates), "The number of hypotheses and their reference(s) should be the same " + p_numerators: Counter = Counter() + p_denominators: Counter = Counter() + + if len(references) != len(candidates): + raise ValueError( + f"nb of candidates should be equal to nb of reference lists ({len(candidates)} != " + f"{len(references)})" + ) # Iterate through each hypothesis and their corresponding references. for refs, hyp in zip(references, candidates): diff --git a/tests/ignite/metrics/nlp/test_bleu.py b/tests/ignite/metrics/nlp/test_bleu.py index 57171d60bbe..69fd2551fcc 100644 --- a/tests/ignite/metrics/nlp/test_bleu.py +++ b/tests/ignite/metrics/nlp/test_bleu.py @@ -22,6 +22,9 @@ def test_wrong_inputs(): with pytest.raises(ValueError, match=r"Smooth is not valid"): Bleu(smooth="fake") + with pytest.raises(ValueError, match=r"nb of candidates should be equal to nb of reference lists"): + Bleu().corpus_bleu(references=[[0], [0]], candidates=[[0]]) + with pytest.raises(NotComputableError): Bleu().compute() From 2d8af33f7425ffebde0dbbb883a9764607abbdc4 Mon Sep 17 00:00:00 2001 From: Desroziers Date: Mon, 22 Mar 2021 09:39:08 +0100 Subject: [PATCH 09/13] fix F401 --- ignite/metrics/nlp/bleu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ignite/metrics/nlp/bleu.py b/ignite/metrics/nlp/bleu.py index fad3bed41e4..0940184360e 100644 --- a/ignite/metrics/nlp/bleu.py +++ b/ignite/metrics/nlp/bleu.py @@ -1,6 +1,6 @@ import math from collections import Counter -from typing import Any, Callable, Optional, Sequence, Tuple, Union +from typing import Any, Callable, Sequence, Tuple, Union import torch From ded5f80be6be96d678c048114532c81ed833a97f Mon Sep 17 00:00:00 2001 From: Desroziers Date: Mon, 22 Mar 2021 16:16:18 +0100 Subject: [PATCH 10/13] fix doc --- docs/source/metrics.rst | 7 ++++--- ignite/metrics/nlp/bleu.py | 17 +++++++++++------ tests/ignite/metrics/nlp/test_bleu.py | 18 +++++++++--------- 3 files changed, 24 insertions(+), 18 deletions(-) diff --git a/docs/source/metrics.rst b/docs/source/metrics.rst index eddd6b713fd..3c37f5600b1 100644 --- a/docs/source/metrics.rst +++ b/docs/source/metrics.rst @@ -323,13 +323,14 @@ Complete list of metrics precision.Precision PSNR recall.Recall - Rouge - rouge.RougeL - rouge.RougeN RootMeanSquaredError RunningAverage SSIM TopKCategoricalAccuracy + Bleu + Rouge + RougeL + RougeN Helpers for customizing metrics ------------------------------- diff --git a/ignite/metrics/nlp/bleu.py b/ignite/metrics/nlp/bleu.py index 0940184360e..6e94bca134d 100644 --- a/ignite/metrics/nlp/bleu.py +++ b/ignite/metrics/nlp/bleu.py @@ -58,9 +58,9 @@ class Bleu(Metric): r"""Calculates the `BLEU score `_. .. 
math:: - \text{BLEU} = \text{BP} \dot exp \left( \sum_{n=1}^{N} w_{n} log p_{n} \right) + \text{BLEU} = b_{p} \cdot \exp \left( \sum_{n=1}^{N} w_{n} \: \log p_{n} \right) - where :math:`N` is the order of n-grams, :math:`\text{BP}` is a sentence brevety penalty, :math:`w_{n}` are + where :math:`N` is the order of n-grams, :math:`b_{p}` is a sentence brevety penalty, :math:`w_{n}` are positive weights summing to one and :math:`p_{n}` are modified n-gram precisions. More details can be found in `Papineni et al. 2002`__. @@ -77,7 +77,8 @@ class Bleu(Metric): Args: ngram: order of n-grams. - smooth: enable smoothing. Valid are "no_smooth", "smooth1", "nltk_smooth2" or "smooth2". (Default: "no_smooth") + smooth: enable smoothing. Valid are ``no_smooth``, ``smooth1``, ``nltk_smooth2`` or ``smooth2``. + Default: ``no_smooth``. output_transform: a callable that is used to transform the :class:`~ignite.engine.engine.Engine`'s ``process_function``'s output into the form expected by the metric. This can be useful if, for example, you have a multi-output model and @@ -91,11 +92,15 @@ class Bleu(Metric): .. code-block:: python - from ignite.metrics import Bleu + from ignite.metrics.nlp import Bleu + m = Bleu(ngram=4, smooth="smooth1") + y_pred = "the the the the the the the" y = ["the cat is on the mat", "there is a cat on the mat"] + m.update((y_pred.split(), [y.split()])) + print(m.compute()) .. versionadded:: 0.5.0 @@ -115,7 +120,7 @@ def __init__( self.smoother = _Smoother(method=smooth) super(Bleu, self).__init__(output_transform=output_transform, device=device) - def corpus_bleu(self, references: Sequence[Sequence[Any]], candidates: Sequence[Sequence[Any]],) -> float: + def _corpus_bleu(self, references: Sequence[Sequence[Any]], candidates: Sequence[Sequence[Any]], ) -> float: p_numerators: Counter = Counter() p_denominators: Counter = Counter() @@ -176,7 +181,7 @@ def reset(self) -> None: @reinit__is_reduced def update(self, output: Tuple[Sequence[Any], Sequence[Sequence[Any]]]) -> None: y_pred, y = output - self._sum_of_bleu += self.corpus_bleu(references=[y], candidates=[y_pred]) + self._sum_of_bleu += self._corpus_bleu(references=[y], candidates=[y_pred]) self._num_sentences += 1 @sync_all_reduce("_sum_of_bleu", "_num_sentences") diff --git a/tests/ignite/metrics/nlp/test_bleu.py b/tests/ignite/metrics/nlp/test_bleu.py index 69fd2551fcc..45169bacf68 100644 --- a/tests/ignite/metrics/nlp/test_bleu.py +++ b/tests/ignite/metrics/nlp/test_bleu.py @@ -23,7 +23,7 @@ def test_wrong_inputs(): Bleu(smooth="fake") with pytest.raises(ValueError, match=r"nb of candidates should be equal to nb of reference lists"): - Bleu().corpus_bleu(references=[[0], [0]], candidates=[[0]]) + Bleu()._corpus_bleu(references=[[0], [0]], candidates=[[0]]) with pytest.raises(NotComputableError): Bleu().compute() @@ -47,7 +47,7 @@ def test_corpus_bleu(candidate, references): warnings.simplefilter("ignore") reference = corpus_bleu(references, candidate, weights=weights) bleu = Bleu(ngram=i) - assert pytest.approx(reference) == bleu.corpus_bleu(references, candidate) + assert pytest.approx(reference) == bleu._corpus_bleu(references, candidate) bleu.update((candidate[0], references[0])) assert pytest.approx(reference) == bleu.compute() @@ -71,7 +71,7 @@ def test_corpus_bleu_smooth1(candidate, references): references, candidate, weights=weights, smoothing_function=SmoothingFunction().method1 ) bleu = Bleu(ngram=i, smooth="smooth1") - assert reference == bleu.corpus_bleu(references, candidate) + assert reference == 
bleu._corpus_bleu(references, candidate) bleu.update((candidate[0], references[0])) assert reference == bleu.compute() @@ -95,7 +95,7 @@ def test_corpus_bleu_nltk_smooth2(candidate, references): references, candidate, weights=weights, smoothing_function=SmoothingFunction().method2 ) bleu = Bleu(ngram=i, smooth="nltk_smooth2") - assert reference == bleu.corpus_bleu(references, candidate) + assert reference == bleu._corpus_bleu(references, candidate) bleu.update((candidate[0], references[0])) assert reference == bleu.compute() @@ -119,7 +119,7 @@ def test_corpus_bleu_smooth2(candidate, references): references, candidate, weights=weights, smoothing_function=SmoothingFunction().method2 ) bleu = Bleu(ngram=i, smooth="smooth2") - assert reference == bleu.corpus_bleu(references, candidate) + assert reference == bleu._corpus_bleu(references, candidate) bleu.update((candidate[0], references[0])) assert reference == bleu.compute() @@ -130,10 +130,10 @@ def test_bleu(): bleu.update((corpus.cand_2a, corpus.references_2)) bleu.update((corpus.cand_2b, corpus.references_2)) bleu.update((corpus.cand_3, corpus.references_2)) - value = bleu.corpus_bleu([corpus.references_1], [corpus.cand_1]) - value += bleu.corpus_bleu([corpus.references_2], [corpus.cand_2a]) - value += bleu.corpus_bleu([corpus.references_2], [corpus.cand_2b]) - value += bleu.corpus_bleu([corpus.references_2], [corpus.cand_3]) + value = bleu._corpus_bleu([corpus.references_1], [corpus.cand_1]) + value += bleu._corpus_bleu([corpus.references_2], [corpus.cand_2a]) + value += bleu._corpus_bleu([corpus.references_2], [corpus.cand_2b]) + value += bleu._corpus_bleu([corpus.references_2], [corpus.cand_3]) assert bleu.compute() == value / 4 From 219a19f2838e3504479b19ffd6d14d4c5f9c7c1c Mon Sep 17 00:00:00 2001 From: sdesrozis Date: Mon, 22 Mar 2021 15:18:13 +0000 Subject: [PATCH 11/13] autopep8 fix --- ignite/metrics/nlp/bleu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ignite/metrics/nlp/bleu.py b/ignite/metrics/nlp/bleu.py index 6e94bca134d..90be793d9fd 100644 --- a/ignite/metrics/nlp/bleu.py +++ b/ignite/metrics/nlp/bleu.py @@ -120,7 +120,7 @@ def __init__( self.smoother = _Smoother(method=smooth) super(Bleu, self).__init__(output_transform=output_transform, device=device) - def _corpus_bleu(self, references: Sequence[Sequence[Any]], candidates: Sequence[Sequence[Any]], ) -> float: + def _corpus_bleu(self, references: Sequence[Sequence[Any]], candidates: Sequence[Sequence[Any]],) -> float: p_numerators: Counter = Counter() p_denominators: Counter = Counter() From 1bf1e9c97dffe58784bcddbac9c0cd9f807b7dbc Mon Sep 17 00:00:00 2001 From: Desroziers Date: Mon, 22 Mar 2021 16:42:01 +0100 Subject: [PATCH 12/13] add test --- tests/ignite/metrics/nlp/test_bleu.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/ignite/metrics/nlp/test_bleu.py b/tests/ignite/metrics/nlp/test_bleu.py index 45169bacf68..c98143cf98d 100644 --- a/tests/ignite/metrics/nlp/test_bleu.py +++ b/tests/ignite/metrics/nlp/test_bleu.py @@ -32,6 +32,7 @@ def test_wrong_inputs(): @pytest.mark.parametrize( "candidate, references", [ + ([["a"], ["a"]]), ([["a", "a", "a", "b", "c"]], [[["a", "b", "c"], ["a", "a", "d"]]]), corpus.sample_1, corpus.sample_2, From 989f4bf71ac84945eb1178988daff9988b4518d8 Mon Sep 17 00:00:00 2001 From: Sylvain Desroziers Date: Mon, 22 Mar 2021 21:48:27 +0100 Subject: [PATCH 13/13] Resolve conflict --- ignite/metrics/nlp/rouge.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/ignite/metrics/nlp/rouge.py 
b/ignite/metrics/nlp/rouge.py index 18f264fe09d..d0dede92335 100644 --- a/ignite/metrics/nlp/rouge.py +++ b/ignite/metrics/nlp/rouge.py @@ -139,10 +139,7 @@ def __init__( def _get_multiref_reducer(self) -> MultiRefReducer: if self._multiref == "average": return MultiRefAverageReducer() - elif self._multiref == "best": - return MultiRefBestReducer() - else: - raise ValueError(f"multiref : wrong value (got : {self._multiref})") + return MultiRefBestReducer() @reinit__is_reduced def reset(self) -> None:
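
For reference, a minimal usage sketch of the ``Bleu`` metric as added by this series. Note that the docstring example introduced in PATCH 01 calls ``y.split()`` on a list of reference strings, which would raise an ``AttributeError``; the variant below tokenizes each reference individually, matching the ``update`` signature ``(candidate_tokens, [reference_tokens, ...])`` defined in ``bleu.py``:

    from ignite.metrics.nlp import Bleu

    m = Bleu(ngram=4, smooth="smooth1")

    y_pred = "the the the the the the the"
    y = ["the cat is on the mat", "there is a cat on the mat"]

    # candidate as a token list, references as a list of token lists
    m.update((y_pred.split(), [ref.split() for ref in y]))
    print(m.compute())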