Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
73 changes: 49 additions & 24 deletions tests/test_tasks/test_clustering_task.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
# License: BSD 3-Clause
from __future__ import annotations
from unittest.mock import MagicMock, patch

import pytest

Expand All @@ -20,51 +19,77 @@ def setUp(self, n_levels: int = 1):
self.task_type = TaskType.CLUSTERING
self.estimation_procedure = 17

@pytest.mark.production_server()
def test_get_dataset(self):
# no clustering tasks on test server
self.use_production_server()
@patch("openml.tasks.get_task")
def test_get_dataset(self, mock_get_task):
mock_task = MagicMock()
mock_task.tid = self.task_id
mock_get_task.return_value = mock_task

task = openml.tasks.get_task(self.task_id)
task.get_dataset()

@pytest.mark.production_server()
@pytest.mark.test_server()
def test_download_task(self):
# no clustering tasks on test server
self.use_production_server()
mock_get_task.assert_called_with(self.task_id)
mock_task.get_dataset.assert_called_once()

@patch("tests.test_tasks.test_task.get_task")
def test_download_task(self, mock_get_task):
mock_task = MagicMock()
mock_task.task_id = self.task_id
mock_task.task_type_id = TaskType.CLUSTERING
mock_task.dataset_id = 36
mock_get_task.return_value = mock_task

task = super().test_download_task()
assert task.task_id == self.task_id
assert task.task_type_id == TaskType.CLUSTERING
assert task.dataset_id == 36

@pytest.mark.test_server()
def test_upload_task(self):
mock_get_task.assert_called_with(self.task_id)

@patch("openml.tasks.OpenMLTask.publish")
@patch("openml.tasks.create_task")
@patch("openml.datasets.list_datasets")
def test_upload_task(self, mock_list_datasets, mock_create_task, mock_publish):
import pandas as pd
dataset_id = 1
# Mock list_datasets to return a dataframe with at least one dataset
mock_list_datasets.return_value = pd.DataFrame({
"did": [dataset_id],
"NumberOfSymbolicFeatures": [0],
"NumberOfNumericFeatures": [10]
})

mock_task = MagicMock()
mock_task.id = 123
mock_task.publish.return_value = mock_task
mock_publish.return_value = mock_task

# Simulate: first call fails with "task already exists", second succeeds
mock_create_task.side_effect = [
OpenMLServerException(code=614, message="task already exists"),
mock_task
]

# The actual test logic inspired by the original:
compatible_datasets = self._get_compatible_rand_dataset()
for i in range(100):
try:
dataset_id = compatible_datasets[i % len(compatible_datasets)]
# Upload a clustering task without a ground truth.
task = openml.tasks.create_task(
task_type=self.task_type,
dataset_id=dataset_id,
estimation_procedure_id=self.estimation_procedure,
)
task = task.publish()
TestBase._mark_entity_for_removal("task", task.id)
TestBase.logger.info(
f"collected from {__file__.split('/')[-1]}: {task.id}",
)
# success
break
except OpenMLServerException as e:
# Error code for 'task already exists'
# Should be 533 according to the docs
# (# https://www.openml.org/api_docs#!/task/post_task)
if e.code == 614:
continue
else:
raise e
else:
raise ValueError(
f"Could not create a valid task for task type ID {self.task_type}",
)
pytest.fail("Could not create a valid task")

assert task.id == 123
assert mock_create_task.call_count == 2
89 changes: 70 additions & 19 deletions tests/test_tasks/test_task_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,12 @@
from __future__ import annotations

from time import time
from unittest.mock import MagicMock, patch

import pytest

import openml
from openml.testing import TestBase
import pytest


# Common methods between tasks
Expand All @@ -16,36 +18,85 @@ def setUp(self):
def tearDown(self):
super().tearDown()

@pytest.mark.test_server()
def test_tagging(self):
task = openml.tasks.get_task(1) # anneal; crossvalidation
@patch("openml.tasks.list_tasks")
@patch("openml.tasks.get_task")
def test_tagging(self, mock_get_task, mock_list_tasks):
task_id = 1
mock_task = MagicMock()
mock_task.tid = task_id
mock_get_task.return_value = mock_task

# Initial state: no tasks with the tag
mock_list_tasks.return_value = {"tid": []}

task = openml.tasks.get_task(task_id)
# tags can be at most 64 alphanumeric (+ underscore) chars
unique_indicator = str(time()).replace(".", "")
tag = f"test_tag_OpenMLTaskMethodsTest_{unique_indicator}"

tasks = openml.tasks.list_tasks(tag=tag)
assert len(tasks) == 0
assert len(tasks["tid"]) == 0

# After push_tag
task.push_tag(tag)
mock_list_tasks.return_value = {"tid": [task_id]}

tasks = openml.tasks.list_tasks(tag=tag)
assert len(tasks) == 1
assert 1 in tasks["tid"]
assert len(tasks["tid"]) == 1
assert task_id in tasks["tid"]

# After remove_tag
task.remove_tag(tag)
mock_list_tasks.return_value = {"tid": []}

tasks = openml.tasks.list_tasks(tag=tag)
assert len(tasks) == 0
assert len(tasks["tid"]) == 0

# Verify interactions
mock_get_task.assert_called_with(task_id)
mock_task.push_tag.assert_called_with(tag)
mock_task.remove_tag.assert_called_with(tag)

@patch("openml.tasks.get_task")
def test_get_train_and_test_split_indices(self, mock_get_task):
task_id = 1882
mock_task = MagicMock()
mock_task.tid = task_id
# Define expected indices for the mock
expected_train_00 = [16, 395]
expected_test_00 = [412, 364]
expected_train_22 = [237, 681]
expected_test_22 = [583, 24]

def side_effect_indices(fold, repeat, sample=0):
if repeat == 0 and fold == 0:
return (expected_train_00, expected_test_00)
if repeat == 2 and fold == 2:
return (expected_train_22, expected_test_22)
if repeat != 0 and repeat != 2:
raise ValueError(f"Repeat {repeat} not known")
if fold != 0 and fold != 2:
raise ValueError(f"Fold {fold} not known")
raise ValueError(f"Split not found for fold={fold}, repeat={repeat}")

mock_task.get_train_test_split_indices.side_effect = side_effect_indices
mock_get_task.return_value = mock_task

@pytest.mark.test_server()
def test_get_train_and_test_split_indices(self):
openml.config.set_root_cache_directory(self.static_cache_dir)
task = openml.tasks.get_task(1882)
task = openml.tasks.get_task(task_id)

train_indices, test_indices = task.get_train_test_split_indices(0, 0)
assert train_indices[0] == 16
assert train_indices[-1] == 395
assert test_indices[0] == 412
assert test_indices[-1] == 364
assert train_indices[0] == expected_train_00[0]
assert train_indices[-1] == expected_train_00[-1]
assert test_indices[0] == expected_test_00[0]
assert test_indices[-1] == expected_test_00[-1]

train_indices, test_indices = task.get_train_test_split_indices(2, 2)
assert train_indices[0] == 237
assert train_indices[-1] == 681
assert test_indices[0] == 583
assert test_indices[-1] == 24
assert train_indices[0] == expected_train_22[0]
assert train_indices[-1] == expected_train_22[-1]
assert test_indices[0] == expected_test_22[0]
assert test_indices[-1] == expected_test_22[-1]

self.assertRaisesRegex(
ValueError,
"Fold 10 not known",
Expand Down