Diffstat (limited to 'sci-libs/evaluate/files/evaluate-0.4.0-tests.patch')
-rw-r--r-- | sci-libs/evaluate/files/evaluate-0.4.0-tests.patch | 198
1 file changed, 175 insertions, 23 deletions
diff --git a/sci-libs/evaluate/files/evaluate-0.4.0-tests.patch b/sci-libs/evaluate/files/evaluate-0.4.0-tests.patch
index 1e7e808576e3..452a6d862ada 100644
--- a/sci-libs/evaluate/files/evaluate-0.4.0-tests.patch
+++ b/sci-libs/evaluate/files/evaluate-0.4.0-tests.patch
@@ -8,22 +8,78 @@
  from datasets import ClassLabel, Dataset, Features, Sequence, Value
  from PIL import Image
 
-@@ -335,6 +335,7 @@
+@@ -128,6 +128,7 @@
+         return [{"text": "Lorem ipsum"} for _ in inputs]
+ 
+ 
++@skip("require network")
+ class TestEvaluator(TestCase):
+     def setUp(self):
+         self.data = Dataset.from_dict({"label": [1, 0], "text": ["great movie", "horrible movie"]})
+@@ -230,6 +230,7 @@
+         )
+ 
+ 
++@skip("require network")
+ class TestTextClassificationEvaluator(TestCase):
+     def setUp(self):
+         self.data = Dataset.from_dict({"label": [1, 0], "text": ["great movie", "horrible movie"]})
+@@ -394,6 +394,7 @@
+         self.assertAlmostEqual(results["latency_in_seconds"], results["total_time_in_seconds"] / len(data), 5)
+ 
+ 
++@skip("require network")
+ class TestTextClassificationEvaluatorTwoColumns(TestCase):
+     def setUp(self):
+         self.data = Dataset.from_dict(
+@@ -452,6 +452,7 @@
          self.assertEqual(results["accuracy"], 1.0)
 
-+    @skip("not working")
-     def test_bootstrap(self):
-         data = Dataset.from_dict({"label": [1, 0, 0], "text": ["great movie", "great movie", "horrible movie"]})
-@@ -368,6 +369,7 @@
-         self.assertAlmostEqual(results["samples_per_second"], len(self.data) / results["total_time_in_seconds"], 5)
-         self.assertAlmostEqual(results["latency_in_seconds"], results["total_time_in_seconds"] / len(self.data), 5)
++@skip("require network")
+ class TestImageClassificationEvaluator(TestCase):
+     def setUp(self):
+         self.data = Dataset.from_dict(
+@@ -534,6 +535,7 @@
+         self.assertEqual(results["accuracy"], 0)
+ 
-+    @skip("not working")
-     def test_bootstrap_and_perf(self):
-         data = Dataset.from_dict({"label": [1, 0, 0], "text": ["great movie", "great movie", "horrible movie"]})
++@skip("require network")
+ class TestQuestionAnsweringEvaluator(TestCase):
+     def setUp(self):
+         self.data = Dataset.from_dict(
+@@ -716,6 +716,7 @@
+         )
+         self.assertEqual(results["overall_accuracy"], 0.5)
+ 
++    @skip("require network")
+     def test_class_init(self):
+         evaluator = TokenClassificationEvaluator()
+         self.assertEqual(evaluator.task, "token-classification")
+@@ -735,6 +736,7 @@
+         )
+         self.assertEqual(results["overall_accuracy"], 2 / 3)
++    @skip("require network")
+     def test_overwrite_default_metric(self):
+         accuracy = load("seqeval")
+         results = self.evaluator.compute(
+@@ -750,6 +752,7 @@
+         )
+         self.assertEqual(results["overall_accuracy"], 1.0)
+ 
++    @skip("require network")
+     def test_data_loading(self):
+         # Test passing in dataset by name with data_split
+         data = self.evaluator.load_data("evaluate/conll2003-ci", split="validation[:1]")
+@@ -863,6 +866,7 @@
+         self.pipe = DummyTextGenerationPipeline(num_return_sequences=4)
+         self.evaluator = evaluator("text-generation")
+ 
++    @skip("require network")
+     def test_class_init(self):
+         evaluator = TextGenerationEvaluator()
+         self.assertEqual(evaluator.task, "text-generation")
 @@ -877,6 +877,7 @@
          results = self.evaluator.compute(data=self.data)
          self.assertIsInstance(results["unique_words"], int)
@@ -32,22 +88,22 @@
      def test_overwrite_default_metric(self):
          word_length = load("word_length")
          results = self.evaluator.compute(
-@@ -939,6 +940,7 @@
-         results = self.evaluator.compute(data=self.data)
+@@ -906,6 +910,7 @@
+         self.assertEqual(processed_predictions, {"data": ["A", "B", "C", "D"]})
+ 
+ 
++@skip("require network")
+ class TestText2TextGenerationEvaluator(TestCase):
+     def setUp(self):
+         self.data = Dataset.from_dict(
+@@ -979,6 +984,7 @@
          self.assertEqual(results["bleu"], 0)
 
-+    @skip("require rouge_score")
-     def test_overwrite_default_metric(self):
-         rouge = load("rouge")
-         results = self.evaluator.compute(
-@@ -949,6 +952,7 @@
-         )
-         self.assertEqual(results["rouge1"], 1.0)
-+    @skip("require rouge_score")
-     def test_summarization(self):
-         pipe = DummyText2TextGenerationPipeline(task="summarization", prefix="summary")
-         e = evaluator("summarization")
++@skip("require network")
+ class TestAutomaticSpeechRecognitionEvaluator(TestCase):
+     def setUp(self):
+         self.data = Dataset.from_dict(
 --- a/tests/test_trainer_evaluator_parity.py 2023-05-14 17:50:29.224525549 +0200
 +++ b/tests/test_trainer_evaluator_parity.py 2023-05-14 17:37:40.947501195 +0200
 @@ -269,6 +269,7 @@
@@ -58,3 +114,99 @@
      def test_token_classification_parity(self):
          model_name = "hf-internal-testing/tiny-bert-for-token-classification"
          n_samples = 500
+--- a/tests/test_load.py 2023-05-20 15:45:58.855473557 +0200
++++ b/tests/test_load.py 2023-05-20 15:50:41.620071500 +0200
+@@ -61,6 +61,7 @@
+             hf_modules_cache=self.hf_modules_cache,
+         )
+ 
++    @pytest.mark.skip("require network")
+     def test_HubEvaluationModuleFactory_with_internal_import(self):
+         # "squad_v2" requires additional imports (internal)
+         factory = HubEvaluationModuleFactory(
+@@ -72,6 +73,7 @@
+         module_factory_result = factory.get_module()
+         assert importlib.import_module(module_factory_result.module_path) is not None
+ 
++    @pytest.mark.skip("require network")
+     def test_HubEvaluationModuleFactory_with_external_import(self):
+         # "bleu" requires additional imports (external from github)
+         factory = HubEvaluationModuleFactory(
+@@ -83,6 +85,7 @@
+         module_factory_result = factory.get_module()
+         assert importlib.import_module(module_factory_result.module_path) is not None
+ 
++    @pytest.mark.skip("require network")
+     def test_HubEvaluationModuleFactoryWithScript(self):
+         factory = HubEvaluationModuleFactory(
+             SAMPLE_METRIC_IDENTIFIER,
+@@ -115,6 +118,7 @@
+         module_factory_result = factory.get_module()
+         assert importlib.import_module(module_factory_result.module_path) is not None
+ 
++    @pytest.mark.skip("require network")
+     def test_cache_with_remote_canonical_module(self):
+         metric = "accuracy"
+         evaluation_module_factory(
+@@ -127,6 +131,7 @@
+             metric, download_config=self.download_config, dynamic_modules_path=self.dynamic_modules_path
+         )
+ 
++    @pytest.mark.skip("require network")
+     def test_cache_with_remote_community_module(self):
+         metric = "lvwerra/test"
+         evaluation_module_factory(
+--- a/tests/test_metric.py 2023-05-20 15:54:32.558477445 +0200
++++ b/tests/test_metric.py 2023-05-20 15:55:40.775415987 +0200
+@@ -736,6 +736,7 @@
+ 
+         self.assertDictEqual(dummy_result_1, combined_evaluation.compute(predictions=preds, references=refs))
+ 
++    @pytest.mark.skip('require network')
+     def test_modules_from_string(self):
+         expected_result = {"accuracy": 0.5, "recall": 0.5, "precision": 1.0}
+         predictions = [0, 1]
+--- a/tests/test_metric_common.py 2023-05-20 15:57:02.399146066 +0200
++++ b/tests/test_metric_common.py 2023-05-20 15:59:25.167947472 +0200
+@@ -99,6 +99,7 @@
+     evaluation_module_name = None
+     evaluation_module_type = None
+ 
++    @pytest.mark.skip('require network')
+     def test_load(self, evaluation_module_name, evaluation_module_type):
+         doctest.ELLIPSIS_MARKER = "[...]"
+         evaluation_module = importlib.import_module(
+--- a/tests/test_trainer_evaluator_parity.py 2023-05-20 16:00:55.986549706 +0200
++++ b/tests/test_trainer_evaluator_parity.py 2023-05-20 16:02:51.808766855 +0200
+@@ -4,6 +4,7 @@
+ import subprocess
+ import tempfile
+ import unittest
++import pytest
+ 
+ import numpy as np
+ import torch
+@@ -33,6 +33,7 @@
+     def tearDown(self):
+         shutil.rmtree(self.dir_path, ignore_errors=True)
+ 
++    @pytest.mark.skip('require network')
+     def test_text_classification_parity(self):
+         model_name = "philschmid/tiny-bert-sst2-distilled"
+ 
+@@ -121,6 +122,7 @@
+ 
+         self.assertEqual(transformers_results["eval_accuracy"], evaluator_results["accuracy"])
+ 
++    @pytest.mark.skip('require network')
+     def test_image_classification_parity(self):
+         # we can not compare to the Pytorch transformers example, that uses custom preprocessing on the images
+         model_name = "douwekiela/resnet-18-finetuned-dogfood"
+ 
+@@ -179,6 +181,7 @@
+ 
+         self.assertEqual(transformers_results["eval_accuracy"], evaluator_results["accuracy"])
+ 
++    @pytest.mark.skip('require network')
+     def test_question_answering_parity(self):
+         model_name_v1 = "anas-awadalla/bert-tiny-finetuned-squad"
+         model_name_v2 = "mrm8488/bert-tiny-finetuned-squadv2"