author     Alfredo Tupone <tupone@gentoo.org>  2023-05-20 16:09:13 +0200
committer  Alfredo Tupone <tupone@gentoo.org>  2023-05-20 16:10:25 +0200
commit     bde90d926e277b7ea59c487e38559c3a6813edd6 (patch)
tree       4c8de7f442ce42f78f64e6eecb834e0b9502e3ca /sci-libs/evaluate
parent     sys-apps/inxi: add 3.3.27.1 (diff)
download   gentoo-bde90d926e277b7ea59c487e38559c3a6813edd6.tar.gz
           gentoo-bde90d926e277b7ea59c487e38559c3a6813edd6.tar.bz2
           gentoo-bde90d926e277b7ea59c487e38559c3a6813edd6.zip
sci-libs/evaluate: skip tests that require network access
Closes: https://bugs.gentoo.org/906681
Signed-off-by: Alfredo Tupone <tupone@gentoo.org>
Diffstat (limited to 'sci-libs/evaluate')
-rw-r--r--  sci-libs/evaluate/evaluate-0.4.0-r2.ebuild          |   1
-rw-r--r--  sci-libs/evaluate/files/evaluate-0.4.0-tests.patch  | 198
2 files changed, 176 insertions, 23 deletions
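The bulk of the change grows evaluate-0.4.0-tests.patch from the earlier per-test skips to blanket skips of everything that reaches the Hugging Face Hub: unittest's skip decorator is applied at class level where an entire evaluator suite needs the network, and at method level where only individual cases do. A minimal sketch of the mechanism follows; the class and method names loosely mirror the patched suite, and the bodies are illustrative only:

    from unittest import TestCase, skip

    @skip("require network")            # every test in the class is skipped
    class TestEvaluator(TestCase):
        def test_compute(self):
            ...

    class TestTokenClassificationEvaluator(TestCase):
        def test_local_only(self):      # still runs; no network needed
            ...

        @skip("require network")        # only this test is skipped
        def test_class_init(self):
            ...
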
diff --git a/sci-libs/evaluate/evaluate-0.4.0-r2.ebuild b/sci-libs/evaluate/evaluate-0.4.0-r2.ebuild
index 11f7e2a1f9c2..61fc96cebeac 100644
--- a/sci-libs/evaluate/evaluate-0.4.0-r2.ebuild
+++ b/sci-libs/evaluate/evaluate-0.4.0-r2.ebuild
@@ -47,5 +47,6 @@ src_prepare() {
rm -r metrics/{nist_mt,rl_reliability,rouge,sacrebleu,sari} || die
rm -r metrics/{ter,trec_eval,wiki_split,xtreme_s} || die
rm -r measurements/word_length || die
+ rm tests/test_evaluation_suite.py || die
distutils-r1_src_prepare
}
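The ebuild side adds a single deletion: tests/test_evaluation_suite.py is removed outright in src_prepare, presumably because every test in it needs the Hub and nothing in the file is worth patching. An alternative that would keep the file visible to the runner, sketched here for comparison rather than taken from the ebuild, is pytest's module-level skip:

    # hypothetical alternative to 'rm tests/test_evaluation_suite.py':
    # abort collection of the whole module with a recorded reason
    import pytest

    pytest.skip("require network", allow_module_level=True)

Deleting the file keeps the patch smaller, at the cost of the skip never showing up in the test report.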
diff --git a/sci-libs/evaluate/files/evaluate-0.4.0-tests.patch b/sci-libs/evaluate/files/evaluate-0.4.0-tests.patch
index 1e7e808576e3..452a6d862ada 100644
--- a/sci-libs/evaluate/files/evaluate-0.4.0-tests.patch
+++ b/sci-libs/evaluate/files/evaluate-0.4.0-tests.patch
@@ -8,22 +8,78 @@
from datasets import ClassLabel, Dataset, Features, Sequence, Value
from PIL import Image
-@@ -335,6 +335,7 @@
+@@ -128,6 +128,7 @@
+ return [{"text": "Lorem ipsum"} for _ in inputs]
+
+
++@skip("require network")
+ class TestEvaluator(TestCase):
+ def setUp(self):
+ self.data = Dataset.from_dict({"label": [1, 0], "text": ["great movie", "horrible movie"]})
+@@ -230,6 +230,7 @@
)
+
+
++@skip("require network")
+ class TestTextClassificationEvaluator(TestCase):
+ def setUp(self):
+ self.data = Dataset.from_dict({"label": [1, 0], "text": ["great movie", "horrible movie"]})
+@@ -394,6 +394,7 @@
+ self.assertAlmostEqual(results["latency_in_seconds"], results["total_time_in_seconds"] / len(data), 5)
+
+
++@skip("require network")
+ class TestTextClassificationEvaluatorTwoColumns(TestCase):
+ def setUp(self):
+ self.data = Dataset.from_dict(
+@@ -452,6 +452,7 @@
self.assertEqual(results["accuracy"], 1.0)
-+ @skip("not working")
- def test_bootstrap(self):
- data = Dataset.from_dict({"label": [1, 0, 0], "text": ["great movie", "great movie", "horrible movie"]})
-@@ -368,6 +369,7 @@
- self.assertAlmostEqual(results["samples_per_second"], len(self.data) / results["total_time_in_seconds"], 5)
- self.assertAlmostEqual(results["latency_in_seconds"], results["total_time_in_seconds"] / len(self.data), 5)
++@skip("require network")
+ class TestImageClassificationEvaluator(TestCase):
+ def setUp(self):
+ self.data = Dataset.from_dict(
+@@ -534,6 +535,7 @@
+ self.assertEqual(results["accuracy"], 0)
+
-+ @skip("not working")
- def test_bootstrap_and_perf(self):
- data = Dataset.from_dict({"label": [1, 0, 0], "text": ["great movie", "great movie", "horrible movie"]})
++@skip("require network")
+ class TestQuestionAnsweringEvaluator(TestCase):
+ def setUp(self):
+ self.data = Dataset.from_dict(
+@@ -716,6 +716,7 @@
+ )
+ self.assertEqual(results["overall_accuracy"], 0.5)
+
++ @skip("require network")
+ def test_class_init(self):
+ evaluator = TokenClassificationEvaluator()
+ self.assertEqual(evaluator.task, "token-classification")
+@@ -735,6 +736,7 @@
+ )
+ self.assertEqual(results["overall_accuracy"], 2 / 3)
++ @skip("require network")
+ def test_overwrite_default_metric(self):
+ accuracy = load("seqeval")
+ results = self.evaluator.compute(
+@@ -750,6 +752,7 @@
+ )
+ self.assertEqual(results["overall_accuracy"], 1.0)
+
++ @skip("require network")
+ def test_data_loading(self):
+ # Test passing in dataset by name with data_split
+ data = self.evaluator.load_data("evaluate/conll2003-ci", split="validation[:1]")
+@@ -863,6 +866,7 @@
+ self.pipe = DummyTextGenerationPipeline(num_return_sequences=4)
+ self.evaluator = evaluator("text-generation")
+
++ @skip("require network")
+ def test_class_init(self):
+ evaluator = TextGenerationEvaluator()
+ self.assertEqual(evaluator.task, "text-generation")
@@ -877,6 +877,7 @@
results = self.evaluator.compute(data=self.data)
self.assertIsInstance(results["unique_words"], int)
@@ -32,22 +88,22 @@
def test_overwrite_default_metric(self):
word_length = load("word_length")
results = self.evaluator.compute(
-@@ -939,6 +940,7 @@
- results = self.evaluator.compute(data=self.data)
+@@ -906,6 +910,7 @@
+ self.assertEqual(processed_predictions, {"data": ["A", "B", "C", "D"]})
+
+
++@skip("require network")
+ class TestText2TextGenerationEvaluator(TestCase):
+ def setUp(self):
+ self.data = Dataset.from_dict(
+@@ -979,6 +984,7 @@
self.assertEqual(results["bleu"], 0)
-+ @skip("require rouge_score")
- def test_overwrite_default_metric(self):
- rouge = load("rouge")
- results = self.evaluator.compute(
-@@ -949,6 +952,7 @@
- )
- self.assertEqual(results["rouge1"], 1.0)
-+ @skip("require rouge_score")
- def test_summarization(self):
- pipe = DummyText2TextGenerationPipeline(task="summarization", prefix="summary")
- e = evaluator("summarization")
++@skip("require network")
+ class TestAutomaticSpeechRecognitionEvaluator(TestCase):
+ def setUp(self):
+ self.data = Dataset.from_dict(
--- a/tests/test_trainer_evaluator_parity.py 2023-05-14 17:50:29.224525549 +0200
+++ b/tests/test_trainer_evaluator_parity.py 2023-05-14 17:37:40.947501195 +0200
@@ -269,6 +269,7 @@
@@ -58,3 +114,99 @@
def test_token_classification_parity(self):
model_name = "hf-internal-testing/tiny-bert-for-token-classification"
n_samples = 500
+--- a/tests/test_load.py 2023-05-20 15:45:58.855473557 +0200
++++ b/tests/test_load.py 2023-05-20 15:50:41.620071500 +0200
+@@ -61,6 +61,7 @@
+ hf_modules_cache=self.hf_modules_cache,
+ )
+
++ @pytest.mark.skip("require network")
+ def test_HubEvaluationModuleFactory_with_internal_import(self):
+ # "squad_v2" requires additional imports (internal)
+ factory = HubEvaluationModuleFactory(
+@@ -72,6 +73,7 @@
+ module_factory_result = factory.get_module()
+ assert importlib.import_module(module_factory_result.module_path) is not None
+
++ @pytest.mark.skip("require network")
+ def test_HubEvaluationModuleFactory_with_external_import(self):
+ # "bleu" requires additional imports (external from github)
+ factory = HubEvaluationModuleFactory(
+@@ -83,6 +85,7 @@
+ module_factory_result = factory.get_module()
+ assert importlib.import_module(module_factory_result.module_path) is not None
+
++ @pytest.mark.skip("require network")
+ def test_HubEvaluationModuleFactoryWithScript(self):
+ factory = HubEvaluationModuleFactory(
+ SAMPLE_METRIC_IDENTIFIER,
+@@ -115,6 +118,7 @@
+ module_factory_result = factory.get_module()
+ assert importlib.import_module(module_factory_result.module_path) is not None
+
++ @pytest.mark.skip("require network")
+ def test_cache_with_remote_canonical_module(self):
+ metric = "accuracy"
+ evaluation_module_factory(
+@@ -127,6 +131,7 @@
+ metric, download_config=self.download_config, dynamic_modules_path=self.dynamic_modules_path
+ )
+
++ @pytest.mark.skip("require network")
+ def test_cache_with_remote_community_module(self):
+ metric = "lvwerra/test"
+ evaluation_module_factory(
+--- a/tests/test_metric.py 2023-05-20 15:54:32.558477445 +0200
++++ b/tests/test_metric.py 2023-05-20 15:55:40.775415987 +0200
+@@ -736,6 +736,7 @@
+
+ self.assertDictEqual(dummy_result_1, combined_evaluation.compute(predictions=preds, references=refs))
+
++ @pytest.mark.skip('require network')
+ def test_modules_from_string(self):
+ expected_result = {"accuracy": 0.5, "recall": 0.5, "precision": 1.0}
+ predictions = [0, 1]
+--- a/tests/test_metric_common.py 2023-05-20 15:57:02.399146066 +0200
++++ b/tests/test_metric_common.py 2023-05-20 15:59:25.167947472 +0200
+@@ -99,6 +99,7 @@
+ evaluation_module_name = None
+ evaluation_module_type = None
+
++ @pytest.mark.skip('require network')
+ def test_load(self, evaluation_module_name, evaluation_module_type):
+ doctest.ELLIPSIS_MARKER = "[...]"
+ evaluation_module = importlib.import_module(
+--- a/tests/test_trainer_evaluator_parity.py 2023-05-20 16:00:55.986549706 +0200
++++ b/tests/test_trainer_evaluator_parity.py 2023-05-20 16:02:51.808766855 +0200
+@@ -4,6 +4,7 @@
+ import subprocess
+ import tempfile
+ import unittest
++import pytest
+
+ import numpy as np
+ import torch
+@@ -33,6 +33,7 @@
+ def tearDown(self):
+ shutil.rmtree(self.dir_path, ignore_errors=True)
+
++ @pytest.mark.skip('require network')
+ def test_text_classification_parity(self):
+ model_name = "philschmid/tiny-bert-sst2-distilled"
+
+@@ -121,6 +122,7 @@
+
+ self.assertEqual(transformers_results["eval_accuracy"], evaluator_results["accuracy"])
+
++ @pytest.mark.skip('require network')
+ def test_image_classification_parity(self):
+ # we can not compare to the Pytorch transformers example, that uses custom preprocessing on the images
+ model_name = "douwekiela/resnet-18-finetuned-dogfood"
+@@ -179,6 +181,7 @@
+
+ self.assertEqual(transformers_results["eval_accuracy"], evaluator_results["accuracy"])
+
++ @pytest.mark.skip('require network')
+ def test_question_answering_parity(self):
+ model_name_v1 = "anas-awadalla/bert-tiny-finetuned-squad"
+ model_name_v2 = "mrm8488/bert-tiny-finetuned-squadv2"
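The pytest-based files (tests/test_load.py, tests/test_metric.py, tests/test_metric_common.py, tests/test_trainer_evaluator_parity.py) use @pytest.mark.skip instead of unittest's decorator, which is why the last file also gains an import pytest line alongside its existing unittest import. pytest honours the skip mark on plain functions and on unittest.TestCase methods alike, so both spellings land in the same skip report. A minimal sketch, with an illustrative class name:

    import unittest
    import pytest

    class TestParity(unittest.TestCase):
        @pytest.mark.skip("require network")   # reported as skipped, with reason
        def test_text_classification_parity(self):
            ...

Running the suite with pytest -rs afterwards lists every skipped test together with its "require network" reason, which is a quick offline check that nothing in the remaining tests still tries to download models or metrics.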