From f584e8a9660dede63c3f9717ad4cb861e9cebd7b Mon Sep 17 00:00:00 2001 From: Sylvain Desroziers Date: Tue, 23 Mar 2021 12:10:41 +0100 Subject: [PATCH] Add bleu metric (#1834) * add bleu metric - refactor rouge and add nlp module * Remove blank line * autopep8 fix * Remove metrics * Add nlp metrics in nlp.__init__ * autopep8 fix * update * expose nlp - replace assert by exception * fix F401 * fix doc * autopep8 fix * add test * Resolve conflict Co-authored-by: Desroziers Co-authored-by: sdesrozis Co-authored-by: vfdev --- docs/source/metrics.rst | 7 +- ignite/metrics/__init__.py | 10 +- ignite/metrics/nlp/__init__.py | 9 + ignite/metrics/nlp/bleu.py | 191 +++++++++++++++ ignite/metrics/{ => nlp}/rouge.py | 52 +--- ignite/metrics/nlp/utils.py | 89 +++++++ tests/ignite/metrics/nlp/__init__.py | 61 +++++ tests/ignite/metrics/nlp/test_bleu.py | 244 +++++++++++++++++++ tests/ignite/metrics/{ => nlp}/test_rouge.py | 81 +----- tests/ignite/metrics/nlp/test_utils.py | 57 +++++ 10 files changed, 674 insertions(+), 127 deletions(-) create mode 100644 ignite/metrics/nlp/__init__.py create mode 100644 ignite/metrics/nlp/bleu.py rename ignite/metrics/{ => nlp}/rouge.py (91%) create mode 100644 ignite/metrics/nlp/utils.py create mode 100644 tests/ignite/metrics/nlp/__init__.py create mode 100644 tests/ignite/metrics/nlp/test_bleu.py rename tests/ignite/metrics/{ => nlp}/test_rouge.py (75%) create mode 100644 tests/ignite/metrics/nlp/test_utils.py diff --git a/docs/source/metrics.rst b/docs/source/metrics.rst index c47a68cf9ca..5af9020e3d4 100644 --- a/docs/source/metrics.rst +++ b/docs/source/metrics.rst @@ -325,13 +325,14 @@ Complete list of metrics precision.Precision PSNR recall.Recall - Rouge - rouge.RougeL - rouge.RougeN RootMeanSquaredError RunningAverage SSIM TopKCategoricalAccuracy + Bleu + Rouge + RougeL + RougeN Helpers for customizing metrics ------------------------------- diff --git a/ignite/metrics/__init__.py b/ignite/metrics/__init__.py index 
47765f677eb..d743faf4f62 100644 --- a/ignite/metrics/__init__.py +++ b/ignite/metrics/__init__.py @@ -11,11 +11,12 @@ from ignite.metrics.metric import BatchFiltered, BatchWise, EpochWise, Metric, MetricUsage from ignite.metrics.metrics_lambda import MetricsLambda from ignite.metrics.multilabel_confusion_matrix import MultiLabelConfusionMatrix +from ignite.metrics.nlp.bleu import Bleu +from ignite.metrics.nlp.rouge import Rouge, RougeL, RougeN from ignite.metrics.precision import Precision from ignite.metrics.psnr import PSNR from ignite.metrics.recall import Recall from ignite.metrics.root_mean_squared_error import RootMeanSquaredError -from ignite.metrics.rouge import Rouge, RougeL, RougeN from ignite.metrics.running_average import RunningAverage from ignite.metrics.ssim import SSIM from ignite.metrics.top_k_categorical_accuracy import TopKCategoricalAccuracy @@ -43,11 +44,12 @@ "PSNR", "Recall", "RootMeanSquaredError", - "Rouge", - "RougeN", - "RougeL", "RunningAverage", "VariableAccumulation", "Frequency", "SSIM", + "Bleu", + "Rouge", + "RougeN", + "RougeL", ] diff --git a/ignite/metrics/nlp/__init__.py b/ignite/metrics/nlp/__init__.py new file mode 100644 index 00000000000..506f0bab51e --- /dev/null +++ b/ignite/metrics/nlp/__init__.py @@ -0,0 +1,9 @@ +from ignite.metrics.nlp.bleu import Bleu +from ignite.metrics.nlp.rouge import Rouge, RougeL, RougeN + +__all__ = [ + "Bleu", + "Rouge", + "RougeN", + "RougeL", +] diff --git a/ignite/metrics/nlp/bleu.py b/ignite/metrics/nlp/bleu.py new file mode 100644 index 00000000000..90be793d9fd --- /dev/null +++ b/ignite/metrics/nlp/bleu.py @@ -0,0 +1,191 @@ +import math +from collections import Counter +from typing import Any, Callable, Sequence, Tuple, Union + +import torch + +from ignite.exceptions import NotComputableError +from ignite.metrics.metric import Metric, reinit__is_reduced, sync_all_reduce +from ignite.metrics.nlp.utils import modified_precision + +__all__ = ["Bleu"] + + +def _closest_ref_length(references: 
class _Smoother:
    """Smooth corpus-level modified n-gram precisions for BLEU.

    Each strategy receives two counters keyed by n-gram order: the clipped
    match counts (numerators) and the total candidate n-gram counts
    (denominators). It returns one precision per key of ``numerators``, in
    the iteration order of ``numerators``.

    A review of smoothing techniques is given in Chen et al. 2014:
    http://acl2014.org/acl2014/W14-33/pdf/W14-3346.pdf

    Args:
        method: name of the strategy, one of ``no_smooth``, ``smooth1``,
            ``nltk_smooth2`` or ``smooth2``.

    Raises:
        ValueError: if ``method`` is not one of the valid names.
    """

    def __init__(self, method: str):
        valid = ["no_smooth", "smooth1", "nltk_smooth2", "smooth2"]
        if method not in valid:
            raise ValueError(f"Smooth is not valid (expected: {valid}, got: {method})")
        self.smooth = method

    def __call__(self, numerators: Counter, denominators: Counter) -> Sequence[float]:
        # The strategy name doubles as the name of the static method implementing it.
        method = getattr(self, self.smooth)
        return method(numerators, denominators)

    @staticmethod
    def _pairs(numerators: Counter, denominators: Counter) -> Sequence[Tuple[int, int]]:
        """Return ``(numerator, denominator)`` pairs matched by n-gram order key.

        Looking the denominator up by key (instead of zipping the two
        ``values()`` views) keeps every pair aligned even when the counters
        were populated in a different key order. A key missing from
        ``denominators`` counts as 0, which is what ``Counter`` returns.
        """
        return [(count, denominators[order]) for order, count in numerators.items()]

    @staticmethod
    def smooth1(numerators: Counter, denominators: Counter) -> Sequence[float]:
        """Replace a zero match count by ``epsilon = 0.1``; denominators are floored to 1."""
        epsilon = 0.1
        precisions = []
        for n, d in _Smoother._pairs(numerators, denominators):
            d = max(1, d)  # avoid dividing by zero when the candidate has no n-gram of this order
            precisions.append(n / d if n != 0 else epsilon / d)
        return precisions

    @staticmethod
    def nltk_smooth2(numerators: Counter, denominators: Counter) -> Sequence[float]:
        """Add one to every match count and to every denominator floored to 1."""
        return [(n + 1) / (max(1, d) + 1) for n, d in _Smoother._pairs(numerators, denominators)]

    @staticmethod
    def smooth2(numerators: Counter, denominators: Counter) -> Sequence[float]:
        """Add one to every match count and to every raw denominator."""
        return [(n + 1) / (d + 1) for n, d in _Smoother._pairs(numerators, denominators)]

    @staticmethod
    def no_smooth(numerators: Counter, denominators: Counter) -> Sequence[float]:
        """Return the plain ratios; denominators are floored to 1 to avoid division by zero."""
        return [n / max(1, d) for n, d in _Smoother._pairs(numerators, denominators)]
+ + More details can be found in `Papineni et al. 2002`__. + + __ https://www.aclweb.org/anthology/P02-1040.pdf + + In addition, a review of smoothing techniques can be found in `Chen et al. 2014`__ + + __ http://acl2014.org/acl2014/W14-33/pdf/W14-3346.pdf + + Remark : + + This implementation is inspired by nltk + + Args: + ngram: order of n-grams. + smooth: enable smoothing. Valid are ``no_smooth``, ``smooth1``, ``nltk_smooth2`` or ``smooth2``. + Default: ``no_smooth``. + output_transform: a callable that is used to transform the + :class:`~ignite.engine.engine.Engine`'s ``process_function``'s output into the + form expected by the metric. This can be useful if, for example, you have a multi-output model and + you want to compute the metric with respect to one of the outputs. + By default, metrics require the output as ``(y_pred, y)`` or ``{'y_pred': y_pred, 'y': y}``. + device: specifies which device updates are accumulated on. Setting the + metric's device to be the same as your ``update`` arguments ensures the ``update`` method is + non-blocking. By default, CPU. + + Example: + + .. code-block:: python + + from ignite.metrics.nlp import Bleu + + m = Bleu(ngram=4, smooth="smooth1") + + y_pred = "the the the the the the the" + y = ["the cat is on the mat", "there is a cat on the mat"] + + m.update((y_pred.split(), [_y.split() for _y in y])) + + print(m.compute()) + + .. 
def _corpus_bleu(
    self, references: Sequence[Sequence[Sequence[Any]]], candidates: Sequence[Sequence[Any]]
) -> float:
    """Compute the corpus-level BLEU score of ``candidates`` against ``references``.

    Clipped n-gram matches and candidate n-gram totals are accumulated over
    all (reference list, candidate) pairs for every order from 1 to
    ``self.ngrams_order``. The resulting precisions are smoothed with
    ``self.smoother`` and combined as a weighted geometric mean scaled by the
    corpus-level brevity penalty.

    Args:
        references: one list of reference sequences per candidate.
        candidates: candidate sequences (hypotheses).

    Returns:
        The BLEU score as a float. ``0.0`` when no unigram matches, or when
        smoothing is disabled and at least one n-gram order has no match.

    Raises:
        ValueError: if ``references`` and ``candidates`` differ in length.
    """
    if len(references) != len(candidates):
        raise ValueError(
            f"nb of candidates should be equal to nb of reference lists ({len(candidates)} != "
            f"{len(references)})"
        )

    p_numerators: Counter = Counter()
    p_denominators: Counter = Counter()

    # Iterate through each hypothesis and their corresponding references.
    for refs, hyp in zip(references, candidates):
        # For each order of ngram, calculate the numerator and
        # denominator for the corpus-level modified precision.
        for i in range(1, self.ngrams_order + 1):
            numerator, denominator = modified_precision(refs, hyp, i)
            p_numerators[i] += numerator
            p_denominators[i] += denominator

    # Returns 0 if there's no matching n-grams.
    # We only need to check for p_numerators[1] == 0, since if there's
    # no unigrams, there won't be any higher order ngrams.
    if p_numerators[1] == 0:
        return 0.0

    # Without smoothing, a single n-gram order with no match makes the
    # geometric mean zero (and log(0) undefined), so exit early.
    if self.smoother.smooth == "no_smooth" and min(p_numerators.values()) == 0:
        return 0.0

    # Calculate the hypothesis lengths
    hyp_lengths = [len(hyp) for hyp in candidates]

    # Calculate the closest reference lengths.
    ref_lengths = [_closest_ref_length(refs, hyp_len) for refs, hyp_len in zip(references, hyp_lengths)]

    # Sum of hypothesis and references lengths
    hyp_len = sum(hyp_lengths)
    ref_len = sum(ref_lengths)

    # Calculate corpus-level brevity penalty.
    if hyp_len < ref_len:
        bp = math.exp(1 - ref_len / hyp_len) if hyp_len > 0 else 0.0
    else:
        bp = 1.0

    # Smoothing
    p_n = self.smoother(p_numerators, p_denominators)

    # Compute the geometric mean
    s = [w_i * math.log(p_i) for w_i, p_i in zip(self.weights, p_n)]
    gm = bp * math.exp(math.fsum(s))
    return gm
def ngrams(sequence: Sequence[Any], n: int) -> Counter:
    """
    Generate the ngrams from a sequence of items

    Args:
        sequence: sequence of items
        n: n-gram order

    Returns:
        A counter mapping each n-gram (a tuple of ``n`` consecutive items) to its number of occurrences.
        The counter is empty when the sequence holds fewer than ``n`` items.

    .. versionadded:: 0.5.0
    """
    return Counter(tuple(sequence[i : i + n]) for i in range(len(sequence) - n + 1))


def lcs(seq_a: Sequence[Any], seq_b: Sequence[Any]) -> int:
    """
    Compute the length of the longest common subsequence in two sequence of items
    https://en.wikipedia.org/wiki/Longest_common_subsequence_problem

    Args:
        seq_a: first sequence of items
        seq_b: second sequence of items

    Returns:
        The length of the longest common subsequence

    .. versionadded:: 0.5.0
    """
    # Only the previous row of the dynamic-programming table is ever read,
    # so two rows are kept instead of the full (len(seq_a)+1) x (len(seq_b)+1) table.
    previous = [0] * (len(seq_b) + 1)
    for item_a in seq_a:
        current = [0]
        for j, item_b in enumerate(seq_b, start=1):
            if item_a == item_b:
                current.append(previous[j - 1] + 1)
            else:
                current.append(max(previous[j], current[j - 1]))
        previous = current
    return previous[-1]


def modified_precision(references: Sequence[Sequence[Any]], candidate: Sequence[Any], n: int) -> Tuple[int, int]:
    r"""
    Compute the numerator and denominator of the modified precision

    .. math::
       p_{n} = \frac{m_{n}}{l_{n}}

    where :math:`m_{n}` is the number of matched n-grams between translation T and its reference R, and
    :math:`l_{n}` is the total number of n-grams in the translation T.

    More details can be found in `Papineni et al. 2002`__.

    __ https://www.aclweb.org/anthology/P02-1040.pdf

    Args:
        references: list of references R
        candidate: translation T
        n: n-gram order

    Returns:
        A tuple ``(m_n, l_n)``: the number of candidate n-grams matched in the references, each n-gram
        count being clipped to its maximum count in any single reference, and the total number of
        n-grams in the candidate.

    .. versionadded:: 0.5.0
    """
    # ngrams of the candidate
    counts = ngrams(candidate, n)

    # union of ngrams of references: keeps, per n-gram, its maximum count over the references
    max_counts: Counter = Counter()
    for reference in references:
        max_counts |= ngrams(reference, n)

    # clipped count of the candidate and references: per n-gram minimum of both counters
    clipped_counts = counts & max_counts

    return sum(clipped_counts.values()), sum(counts.values())
self.cand_2b], [self.references_2, self.references_2]) + + self.references_3 = [self.ref_2a, self.ref_2b] + self.references_4 = [self.ref_2b, self.ref_2c] + self.references_5 = [self.ref_2a, self.ref_2c] + + self.chunks = [ + (self.cand_1, self.references_1), + (self.cand_2a, self.references_2), + (self.cand_2b, self.references_2), + (self.cand_1, [self.ref_1a]), + (self.cand_2a, self.references_3), + (self.cand_2b, self.references_3), + (self.cand_1, [self.ref_1b]), + (self.cand_2a, self.references_4), + (self.cand_2b, self.references_4), + (self.cand_1, self.references_1), + (self.cand_2a, self.references_5), + (self.cand_2b, self.references_5), + (self.cand_1, [self.ref_1a]), + (self.cand_2a, [self.ref_2a]), + (self.cand_2b, [self.ref_2c]), + ] diff --git a/tests/ignite/metrics/nlp/test_bleu.py b/tests/ignite/metrics/nlp/test_bleu.py new file mode 100644 index 00000000000..c98143cf98d --- /dev/null +++ b/tests/ignite/metrics/nlp/test_bleu.py @@ -0,0 +1,244 @@ +import os +import warnings + +import pytest +import torch +from nltk.translate.bleu_score import SmoothingFunction, corpus_bleu + +import ignite.distributed as idist +from ignite.exceptions import NotComputableError +from ignite.metrics.nlp import Bleu + +from . 
import CorpusForTest + +corpus = CorpusForTest(lower_split=True) + + +def test_wrong_inputs(): + + with pytest.raises(ValueError, match=r"ngram order must be greater than zero"): + Bleu(ngram=0) + + with pytest.raises(ValueError, match=r"Smooth is not valid"): + Bleu(smooth="fake") + + with pytest.raises(ValueError, match=r"nb of candidates should be equal to nb of reference lists"): + Bleu()._corpus_bleu(references=[[0], [0]], candidates=[[0]]) + + with pytest.raises(NotComputableError): + Bleu().compute() + + +@pytest.mark.parametrize( + "candidate, references", + [ + ([["a"], ["a"]]), + ([["a", "a", "a", "b", "c"]], [[["a", "b", "c"], ["a", "a", "d"]]]), + corpus.sample_1, + corpus.sample_2, + corpus.sample_3, + corpus.sample_4, + ], +) +def test_corpus_bleu(candidate, references): + print(candidate, references) + for i in range(1, 8): + weights = tuple([1 / i] * i) + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + reference = corpus_bleu(references, candidate, weights=weights) + bleu = Bleu(ngram=i) + assert pytest.approx(reference) == bleu._corpus_bleu(references, candidate) + bleu.update((candidate[0], references[0])) + assert pytest.approx(reference) == bleu.compute() + + +@pytest.mark.parametrize( + "candidate, references", + [ + ([["a", "a", "a", "b", "c"]], [[["a", "b", "c"], ["a", "a", "d"]]]), + corpus.sample_1, + corpus.sample_2, + corpus.sample_3, + corpus.sample_4, + ], +) +def test_corpus_bleu_smooth1(candidate, references): + for i in range(1, 8): + weights = tuple([1 / i] * i) + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + reference = corpus_bleu( + references, candidate, weights=weights, smoothing_function=SmoothingFunction().method1 + ) + bleu = Bleu(ngram=i, smooth="smooth1") + assert reference == bleu._corpus_bleu(references, candidate) + bleu.update((candidate[0], references[0])) + assert reference == bleu.compute() + + +@pytest.mark.parametrize( + "candidate, references", + [ + ([["a", "a", "a", 
def _test_distrib_integration(device):
    """Check ``Bleu`` attached to an ``Engine`` against NLTK in a distributed run.

    Every chunk of ``corpus.chunks`` is repeated once per process, and each
    rank feeds its own slice of that data to the engine. The metric value is
    compared with the mean of NLTK's ``corpus_bleu`` (smoothing ``method2``)
    computed over the whole data.

    Args:
        device: device of the current process. The run with the metric on
            ``idist.device()`` is skipped when its type is ``xla``.
    """
    from ignite.engine import Engine

    rank = idist.get_rank()

    size = len(corpus.chunks)

    data = []
    for c in corpus.chunks:
        data += idist.get_world_size() * [c]

    def update(_, i):
        # Each rank reads its own contiguous window of ``size`` samples.
        return data[i + size * rank]

    def _test(metric_device):
        engine = Engine(update)
        # Accumulate on the requested device; without this argument both
        # invocations below would exercise the default CPU device only.
        m = Bleu(ngram=4, smooth="smooth2", device=metric_device)
        m.attach(engine, "bleu")

        engine.run(data=list(range(size)), max_epochs=1)

        assert "bleu" in engine.state.metrics

        ref_bleu = 0
        for candidate, references in data:
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                ref_bleu += corpus_bleu(
                    [references],
                    [candidate],
                    weights=[0.25, 0.25, 0.25, 0.25],
                    smoothing_function=SmoothingFunction().method2,
                )

        assert pytest.approx(engine.state.metrics["bleu"]) == ref_bleu / len(data)

    _test("cpu")

    if device.type != "xla":
        _test(idist.device())
idist.has_native_dist_support, reason="Skip if no native dist support") +@pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") +def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): + device = torch.device("cpu") + _test_distrib_integration(device) + + +@pytest.mark.multinode_distributed +@pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") +@pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") +def test_multinode_distrib_gpu(distributed_context_multi_node_nccl): + device = torch.device(f"cuda:{distributed_context_multi_node_nccl['local_rank']}") + _test_distrib_integration(device) + + +@pytest.mark.tpu +@pytest.mark.skipif("NUM_TPU_WORKERS" in os.environ, reason="Skip if NUM_TPU_WORKERS is in env vars") +@pytest.mark.skipif(not idist.has_xla_support, reason="Skip if no PyTorch XLA package") +def test_distrib_single_device_xla(): + device = idist.device() + _test_distrib_integration(device) + + +def _test_distrib_xla_nprocs(index): + device = idist.device() + _test_distrib_integration(device) + + +@pytest.mark.tpu +@pytest.mark.skipif("NUM_TPU_WORKERS" not in os.environ, reason="Skip if no NUM_TPU_WORKERS in env vars") +@pytest.mark.skipif(not idist.has_xla_support, reason="Skip if no PyTorch XLA package") +def test_distrib_xla_nprocs(xmp_executor): + n = int(os.environ["NUM_TPU_WORKERS"]) + xmp_executor(_test_distrib_xla_nprocs, args=(), nprocs=n) diff --git a/tests/ignite/metrics/test_rouge.py b/tests/ignite/metrics/nlp/test_rouge.py similarity index 75% rename from tests/ignite/metrics/test_rouge.py rename to tests/ignite/metrics/nlp/test_rouge.py index b8c5bc6e6f3..40aafae189c 100644 --- a/tests/ignite/metrics/test_rouge.py +++ b/tests/ignite/metrics/nlp/test_rouge.py @@ -7,36 +7,14 @@ import ignite.distributed as idist from ignite.exceptions import NotComputableError -from ignite.metrics import Rouge 
-from ignite.metrics.rouge import RougeL, RougeN, compute_ngram_scores, lcs, ngrams +from ignite.metrics.nlp import Rouge +from ignite.metrics.nlp.rouge import RougeL, RougeN, compute_ngram_scores + +from . import CorpusForTest nltk.download("punkt") - -@pytest.mark.parametrize( - "sequence, n, expected_keys, expected_values", - [ - ([], 1, [], []), - ([0, 1, 2], 1, [(0,), (1,), (2,)], [1, 1, 1]), - ([0, 1, 2], 2, [(0, 1,), (1, 2,),], [1, 1],), - ([0, 1, 2], 3, [(0, 1, 2)], [1]), - ([0, 0, 0], 1, [(0,)], [3]), - ([0, 0, 0], 2, [(0, 0)], [2]), - ("abcde", 4, [("a", "b", "c", "d"), ("b", "c", "d", "e")], [1, 1]), - ], -) -def test_ngrams(sequence, n, expected_keys, expected_values): - ngrams_counter = ngrams(sequence=sequence, n=n) - assert list(ngrams_counter.values()) == expected_values - assert list(ngrams_counter.keys()) == expected_keys - - -@pytest.mark.parametrize( - "seq_a, seq_b, expected", - [([], [], 0), ([0, 1, 2], [0, 1, 2], 3), ([0, 1, 2], [0, 3, 2], 2), ("academy", "abracadabra", 4),], -) -def test_lcs(seq_a, seq_b, expected): - assert lcs(seq_a, seq_b) == expected +corpus = CorpusForTest() @pytest.mark.parametrize( @@ -61,7 +39,7 @@ def test_compute_ngram_scores(candidate, reference, n, expected_precision, expec def test_wrong_inputs(): - with pytest.raises(ValueError, match=r"ngram order must be greater than one"): + with pytest.raises(ValueError, match=r"ngram order must be greater than zero"): RougeN(ngram=0) with pytest.raises(ValueError, match=r"alpha must be in interval \[0, 1\]"): @@ -106,31 +84,8 @@ def test_rouge_n_alpha(ngram, candidate, reference, expected): assert results[f"Rouge-{ngram}-F"] == F -# BLEU Paper examples -CAND_1 = "the the the the the the the" -REF_1A = "The cat is on the mat" -REF_1B = "There is a cat on the mat" - -CAND_2A = "It is a guide to action which ensures that the military always obeys the " "commands of the party" -CAND_2B = "It is to insure the troops forever hearing the activity guidebook that " "party direct" 
-REF_2A = "It is a guide to action that ensures that the military will forever heed " "Party commands" -REF_2B = ( - "It is the guiding principle which guarantees the military forces always being under the " "command of the Party" -) -REF_2C = "It is the practical guide for the army always to heed the directions of the " "party" - -CAND_3 = "of the" - - @pytest.mark.parametrize( - "candidates, references", - [ - ([CAND_1], [[REF_1A, REF_1B]]), - ([CAND_3], [[REF_2A, REF_2B, REF_2C]]), - ([CAND_2A], [[REF_2A, REF_2B, REF_2C]]), - ([CAND_2B], [[REF_2A, REF_2B, REF_2C]]), - ([CAND_2A, CAND_2B], [[REF_2A, REF_2B, REF_2C], [REF_2A, REF_2B, REF_2C]]), - ], + "candidates, references", [corpus.sample_1, corpus.sample_2, corpus.sample_3, corpus.sample_4, corpus.sample_5,], ) def test_rouge_metrics(candidates, references): for multiref in ["average", "best"]: @@ -171,28 +126,10 @@ def _test_distrib_integration(device): rank = idist.get_rank() - chunks = [ - (CAND_1, [REF_1A, REF_1B]), - (CAND_2A, [REF_2A, REF_2B, REF_2C]), - (CAND_2B, [REF_2A, REF_2B, REF_2C]), - (CAND_1, [REF_1A]), - (CAND_2A, [REF_2A, REF_2B]), - (CAND_2B, [REF_2A, REF_2B]), - (CAND_1, [REF_1B]), - (CAND_2A, [REF_2B, REF_2C]), - (CAND_2B, [REF_2B, REF_2C]), - (CAND_1, [REF_1A, REF_1B]), - (CAND_2A, [REF_2A, REF_2C]), - (CAND_2B, [REF_2A, REF_2C]), - (CAND_1, [REF_1A]), - (CAND_2A, [REF_2A]), - (CAND_2B, [REF_2C]), - ] - - size = len(chunks) + size = len(corpus.chunks) data = [] - for c in chunks: + for c in corpus.chunks: data += idist.get_world_size() * [c] def update(_, i): diff --git a/tests/ignite/metrics/nlp/test_utils.py b/tests/ignite/metrics/nlp/test_utils.py new file mode 100644 index 00000000000..8cf267a68bd --- /dev/null +++ b/tests/ignite/metrics/nlp/test_utils.py @@ -0,0 +1,57 @@ +import pytest + +from ignite.metrics.nlp.utils import lcs, modified_precision, ngrams + + +@pytest.mark.parametrize( + "sequence, n, expected_keys, expected_values", + [ + ([], 1, [], []), + ([0, 1, 2], 1, [(0,), 
(1,), (2,)], [1, 1, 1]), + ([0, 1, 2], 2, [(0, 1,), (1, 2,),], [1, 1],), + ([0, 1, 2], 3, [(0, 1, 2)], [1]), + ([0, 0, 0], 1, [(0,)], [3]), + ([0, 0, 0], 2, [(0, 0)], [2]), + ("abcde", 4, [("a", "b", "c", "d"), ("b", "c", "d", "e")], [1, 1]), + ], +) +def test_ngrams(sequence, n, expected_keys, expected_values): + ngrams_counter = ngrams(sequence=sequence, n=n) + assert list(ngrams_counter.values()) == expected_values + assert list(ngrams_counter.keys()) == expected_keys + + +@pytest.mark.parametrize( + "seq_a, seq_b, expected", + [([], [], 0), ([0, 1, 2], [0, 1, 2], 3), ([0, 1, 2], [0, 3, 2], 2), ("academy", "abracadabra", 4),], +) +def test_lcs(seq_a, seq_b, expected): + assert lcs(seq_a, seq_b) == expected + + +def test_modified_precision_empty(): + for k in range(1, 5): + n, d = modified_precision([[]], [], k) + assert n == 0 and d == 0 + n, d = modified_precision([[]], [0], k) + assert n == 0 and d == (k == 1) + n, d = modified_precision([[0]], [], k) + assert n == 0 and d == 0 + n, d = modified_precision([[]], list(range(k)), k) + assert n == 0 and d == 1 + n, d = modified_precision([list(range(k))], [], k) + assert n == 0 and d == 0 + + +@pytest.mark.parametrize( + "references, candidate, expected", + [ + ([[0, 0, 0], [1, 2]], [1, 2, 3, 4], ((2, 4), (1, 3), (0, 2))), + ([[0, 1, 2], [0, 0, 3]], [0, 0, 0, 1, 2], ((4, 5), (3, 4), (1, 3))), + ([[0, 1, 2], [3, 0, 3]], [3, 0, 0, 1, 2], ((4, 5), (3, 4), (1, 3))), + ], +) +def test_modified_precision(references, candidate, expected): + for n, (e_n, e_d) in enumerate(expected, start=1): + n, d = modified_precision(references, candidate, n) + assert n == e_n and d == e_d