From 410f0a6ca3e01477be31639c049304993a18c778 Mon Sep 17 00:00:00 2001 From: Manas-7854 Date: Wed, 18 Mar 2026 18:30:42 +0530 Subject: [PATCH 1/2] fixed ruff errors --- openml/cli.py | 197 +++++++++++++++++++++++++++++++++- tests/test_openml/test_cli.py | 82 ++++++++++++-- 2 files changed, 271 insertions(+), 8 deletions(-) diff --git a/openml/cli.py b/openml/cli.py index 838f774d1..3e6fe9264 100644 --- a/openml/cli.py +++ b/openml/cli.py @@ -3,6 +3,7 @@ from __future__ import annotations import argparse +import pickle import string import sys from collections.abc import Callable @@ -301,6 +302,119 @@ def configure_field( # noqa: PLR0913 verbose_set(field, value) +def upload_dataset(args: argparse.Namespace) -> None: + """Upload a dataset from a CSV or ARFF file to OpenML.""" + import pandas as pd + + file_path = Path(args.file_path) + if not file_path.is_file(): + print(f"Error: File '{file_path}' not found.") + sys.exit(1) + + suffix = file_path.suffix.lower() + if suffix == ".csv": + data = pd.read_csv(file_path) + elif suffix == ".arff": + import arff + + with file_path.open() as fh: + arff_data = arff.load(fh) + data = pd.DataFrame( + arff_data["data"], + columns=[attr[0] for attr in arff_data["attributes"]], + ) + else: + print(f"Error: Unsupported file format '{suffix}'. Supported formats: .csv, .arff") + sys.exit(1) + + dataset = openml.datasets.create_dataset( + name=args.name, + description=args.description, + creator=args.creator, + contributor=args.contributor, + collection_date=args.collection_date, + language=args.language, + licence=args.licence, + attributes="auto", + data=data, + default_target_attribute=args.default_target_attribute, + ignore_attribute=args.ignore_attribute, + citation=args.citation or "", + row_id_attribute=args.row_id_attribute, + original_data_url=args.original_data_url, + paper_url=args.paper_url, + version_label=args.version_label, + update_comment=args.update_comment, + ) + dataset.publish() + print(f"Dataset successfully uploaded. ID: {dataset.id}") + print(f"URL: {dataset.openml_url}") + + +def upload_flow(args: argparse.Namespace) -> None: + """Upload a flow from a serialized model file to OpenML.""" + from openml_sklearn import SklearnExtension + + file_path = Path(args.file_path) + if not file_path.is_file(): + print(f"Error: File '{file_path}' not found.") + sys.exit(1) + + with file_path.open("rb") as fh: + model = pickle.load(fh) # noqa: S301 + + extension = SklearnExtension() + flow = extension.model_to_flow(model) + + if args.name: + flow.custom_name = args.name + if args.description: + flow.description = args.description + + flow.publish() + print(f"Flow successfully uploaded. ID: {flow.flow_id}") + print(f"URL: {flow.openml_url}") + + +def upload_run(args: argparse.Namespace) -> None: + """Upload a run from a directory containing run files to OpenML.""" + directory = Path(args.file_path) + if not directory.is_dir(): + print(f"Error: Directory '{directory}' not found.") + sys.exit(1) + + expect_model = not args.no_model + run = openml.runs.OpenMLRun.from_filesystem(directory, expect_model=expect_model) + run.publish() + print(f"Run successfully uploaded. ID: {run.run_id}") + print(f"URL: {run.openml_url}") + + +def upload(args: argparse.Namespace) -> None: + """Dispatch upload subcommands.""" + if not openml.config.apikey: + print( + "Error: No API key configured. Set your API key with:\n" + " openml configure apikey\n" + "For more information, see: " + "https://openml.github.io/openml-python/latest/examples/Basics/" + "introduction_tutorial/#authentication", + ) + sys.exit(1) + + upload_functions: dict[str, Callable[[argparse.Namespace], None]] = { + "dataset": upload_dataset, + "flow": upload_flow, + "run": upload_run, + } + + if args.upload_resource not in upload_functions: + print("Please specify a resource to upload: dataset, flow, or run.") + sys.exit(1) + + upload_functions[args.upload_resource](args) + + def configure(args: argparse.Namespace) -> None: """Calls the right submenu(s) to edit `args.field` in the configuration file.""" set_functions = { @@ -330,7 +444,10 @@ def not_supported_yet(_: str) -> None: def main() -> None: - subroutines = {"configure": configure} + subroutines: dict[str, Callable[[argparse.Namespace], None]] = { + "configure": configure, + "upload": upload, + } parser = argparse.ArgumentParser() # Add a global --version flag to display installed version and exit @@ -371,6 +488,84 @@ def main() -> None: help="The value to set the FIELD to.", ) + # --- upload subcommand --- + parser_upload = subparsers.add_parser( + "upload", + description="Upload resources (datasets, flows, or runs) to OpenML.", + ) + upload_subparsers = parser_upload.add_subparsers(dest="upload_resource") + + # upload dataset + parser_upload_dataset = upload_subparsers.add_parser( + "dataset", + description="Upload a dataset from a CSV or ARFF file.", + ) + parser_upload_dataset.add_argument( + "file_path", + type=str, + help="Path to the dataset file (.csv or .arff).", + ) + _dataset_args: list[tuple[str, str, bool]] = [ + ("--name", "Name of the dataset.", True), + ("--description", "Description of the dataset.", True), + ("--default_target_attribute", "The default target attribute.", True), + ("--creator", "The person who created the dataset.", False), + ("--contributor", "People who contributed to the dataset.", False), + ("--collection_date", "The date the data was originally collected.", False), + ("--language", "Language in which the data is represented.", False), + ("--licence", "License of the data.", False), + ("--ignore_attribute", "Attributes to exclude in modelling (comma separated).", False), + ("--citation", "Reference(s) that should be cited.", False), + ("--row_id_attribute", "The attribute that represents the row-id column.", False), + ("--original_data_url", "URL to the original dataset (for derived data).", False), + ("--paper_url", "Link to a paper describing the dataset.", False), + ("--version_label", "Version label (e.g. date, hash).", False), + ("--update_comment", "An explanation for when the dataset is uploaded.", False), + ] + for flag, help_text, required in _dataset_args: + parser_upload_dataset.add_argument( + flag, + type=str, + required=required, + default=None, + help=help_text, + ) + + # upload flow + parser_upload_flow = upload_subparsers.add_parser( + "flow", + description="Upload a flow from a serialized model file (.pkl).", + ) + parser_upload_flow.add_argument( + "file_path", + type=str, + help="Path to the serialized model file (.pkl).", + ) + parser_upload_flow.add_argument("--name", type=str, default=None, help="Custom flow name.") + parser_upload_flow.add_argument( + "--description", + type=str, + default=None, + help="Description of the flow.", + ) + + # upload run + parser_upload_run = upload_subparsers.add_parser( + "run", + description="Upload a run from a directory containing run files.", + ) + parser_upload_run.add_argument( + "file_path", + type=str, + help="Path to directory with run files (description.xml, predictions.arff, etc.).", + ) + parser_upload_run.add_argument( + "--no_model", + action="store_true", + default=False, + help="If set, do not require model.pkl in the run directory.", + ) + args = parser.parse_args() subroutines.get(args.subroutine, lambda _: parser.print_help())(args) diff --git a/tests/test_openml/test_cli.py b/tests/test_openml/test_cli.py index eb213b561..e6680ac0f 100644 --- a/tests/test_openml/test_cli.py +++ b/tests/test_openml/test_cli.py @@ -4,17 +4,19 @@ import shutil import subprocess import sys +from unittest import mock -import openml import pytest +import openml +from openml.cli import main + def test_cli_version_prints_package_version(): # Invoke the CLI via module to avoid relying on console script installation - result = subprocess.run( + result = subprocess.run( # noqa: S603 [sys.executable, "-m", "openml.cli", "--version"], - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, + capture_output=True, text=True, check=False, ) @@ -31,10 +33,9 @@ def test_console_script_version_prints_package_version(): if console is None: pytest.skip("'openml' console script not found in PATH") - result = subprocess.run( + result = subprocess.run( # noqa: S603 [console, "--version"], - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, + capture_output=True, text=True, check=False, ) @@ -42,3 +43,70 @@ def test_console_script_version_prints_package_version(): assert result.returncode == 0 assert result.stderr == "" assert openml.__version__ in result.stdout + + +def test_upload_dataset_arg_parsing(): + # Test that the dataset subcommand correctly parses required and optional arguments + test_args = [ + "upload", "dataset", "data.csv", + "--name", "MyDataset", + "--description", "A test dataset", + "--default_target_attribute", "target", + "--creator", "TestUser", + ] + with ( + mock.patch("sys.argv", ["openml", *test_args]), + mock.patch("openml.cli.upload") as mock_upload, + ): + main() + args = mock_upload.call_args[0][0] + assert args.subroutine == "upload" + assert args.upload_resource == "dataset" + assert args.file_path == "data.csv" + assert args.name == "MyDataset" + assert args.description == "A test dataset" + assert args.default_target_attribute == "target" + assert args.creator == "TestUser" + assert args.contributor is None + assert args.licence is None + + +def test_upload_flow_arg_parsing(): + # Test that the flow subcommand correctly parses positional and optional arguments + test_args = ["upload", "flow", "model.pkl", "--name", "MyFlow", "--description", "A flow"] + with ( + mock.patch("sys.argv", ["openml", *test_args]), + mock.patch("openml.cli.upload") as mock_upload, + ): + main() + args = mock_upload.call_args[0][0] + assert args.upload_resource == "flow" + assert args.file_path == "model.pkl" + assert args.name == "MyFlow" + assert args.description == "A flow" + + +def test_upload_run_arg_parsing(): + # Test that the run subcommand correctly parses positional and flag arguments + test_args = ["upload", "run", "/path/to/run_dir", "--no_model"] + with ( + mock.patch("sys.argv", ["openml", *test_args]), + mock.patch("openml.cli.upload") as mock_upload, + ): + main() + args = mock_upload.call_args[0][0] + assert args.upload_resource == "run" + assert args.file_path == "/path/to/run_dir" + assert args.no_model is True + + +def test_upload_run_no_model_defaults_false(): + # Test that the --no_model flag defaults to False if not provided + test_args = ["upload", "run", "/path/to/run_dir"] + with ( + mock.patch("sys.argv", ["openml", *test_args]), + mock.patch("openml.cli.upload") as mock_upload, + ): + main() + args = mock_upload.call_args[0][0] + assert args.no_model is False From 76bca3fff73a8f574f62a1ce5594f66c9b01cfae Mon Sep 17 00:00:00 2001 From: Manas-7854 Date: Wed, 18 Mar 2026 19:38:25 +0530 Subject: [PATCH 2/2] added more tests and fixed minor bugs --- openml/cli.py | 17 ++++-- tests/test_openml/test_cli.py | 112 +++++++++++++++++++++++++++++++++- 2 files changed, 122 insertions(+), 7 deletions(-) diff --git a/openml/cli.py b/openml/cli.py index 3e6fe9264..bd5f9ed2c 100644 --- a/openml/cli.py +++ b/openml/cli.py @@ -3,7 +3,6 @@ from __future__ import annotations import argparse -import pickle import string import sys from collections.abc import Callable @@ -353,17 +352,22 @@ def upload_dataset(args: argparse.Namespace) -> None: def upload_flow(args: argparse.Namespace) -> None: """Upload a flow from a serialized model file to OpenML.""" - from openml_sklearn import SklearnExtension + import pickle file_path = Path(args.file_path) if not file_path.is_file(): print(f"Error: File '{file_path}' not found.") sys.exit(1) + print( + "WARNING: Loading pickle files executes arbitrary code. " + "Only use this with files you trust.", + ) with file_path.open("rb") as fh: model = pickle.load(fh) # noqa: S301 - extension = SklearnExtension() + extension = openml.extensions.get_extension_by_model(model, raise_if_no_extension=True) + assert extension is not None # guaranteed by raise_if_no_extension=True flow = extension.model_to_flow(model) if args.name: @@ -508,7 +512,7 @@ def main() -> None: _dataset_args: list[tuple[str, str, bool]] = [ ("--name", "Name of the dataset.", True), ("--description", "Description of the dataset.", True), - ("--default_target_attribute", "The default target attribute.", True), + ("--default_target_attribute", "The default target attribute.", False), ("--creator", "The person who created the dataset.", False), ("--contributor", "People who contributed to the dataset.", False), ("--collection_date", "The date the data was originally collected.", False), @@ -534,12 +538,13 @@ def main() -> None: # upload flow parser_upload_flow = upload_subparsers.add_parser( "flow", - description="Upload a flow from a serialized model file (.pkl).", + description="Upload a flow from a serialized model file (.pkl). " + "WARNING: pickle files can execute arbitrary code. Only use trusted files.", ) parser_upload_flow.add_argument( "file_path", type=str, - help="Path to the serialized model file (.pkl).", + help="Path to the serialized model file (.pkl). WARNING: only use trusted pickle files.", ) parser_upload_flow.add_argument("--name", type=str, default=None, help="Custom flow name.") parser_upload_flow.add_argument( diff --git a/tests/test_openml/test_cli.py b/tests/test_openml/test_cli.py index e6680ac0f..ce2c85752 100644 --- a/tests/test_openml/test_cli.py +++ b/tests/test_openml/test_cli.py @@ -9,7 +9,7 @@ import pytest import openml -from openml.cli import main +from openml.cli import main, upload, upload_dataset, upload_flow, upload_run def test_cli_version_prints_package_version(): @@ -110,3 +110,113 @@ def test_upload_run_no_model_defaults_false(): main() args = mock_upload.call_args[0][0] assert args.no_model is False + + +def test_upload_dataset_csv(tmp_path): + # Verify CSV upload calls create_dataset and publish with correct arguments + csv_file = tmp_path / "test.csv" + csv_file.write_text("col_a,col_b,target\n1,2.0,cat\n3,4.0,dog\n") + + args = mock.MagicMock() + args.file_path = str(csv_file) + args.name = "TestDS" + args.description = "desc" + args.default_target_attribute = "target" + for attr in ( + "creator", "contributor", "collection_date", "language", "licence", + "ignore_attribute", "citation", "row_id_attribute", + "original_data_url", "paper_url", "version_label", "update_comment", + ): + setattr(args, attr, None) + + mock_ds = mock.MagicMock(id=42, openml_url="https://openml.org/d/42") + with mock.patch("openml.datasets.create_dataset", return_value=mock_ds) as mock_create: + upload_dataset(args) + mock_create.assert_called_once() + assert mock_create.call_args[1]["name"] == "TestDS" + assert mock_create.call_args[1]["attributes"] == "auto" + mock_ds.publish.assert_called_once() + + +def test_upload_dataset_file_not_found(capsys): + # Verify a clear error is shown when the file does not exist + args = mock.MagicMock(file_path="/nonexistent/data.csv") + with pytest.raises(SystemExit, match="1"): + upload_dataset(args) + assert "not found" in capsys.readouterr().out + + +def test_upload_dataset_unsupported_format(tmp_path, capsys): + # Verify unsupported file extensions are rejected + bad_file = tmp_path / "data.json" + bad_file.write_text("{}") + args = mock.MagicMock(file_path=str(bad_file)) + with pytest.raises(SystemExit, match="1"): + upload_dataset(args) + assert "Unsupported file format" in capsys.readouterr().out + + +def test_upload_flow_uses_extension_api(tmp_path): + # Verify upload_flow uses get_extension_by_model instead of direct openml_sklearn import + pkl_file = tmp_path / "model.pkl" + pkl_file.write_bytes(b"fake") + + mock_model = mock.MagicMock() + mock_flow = mock.MagicMock(flow_id=99, openml_url="https://openml.org/f/99") + mock_ext = mock.MagicMock() + mock_ext.model_to_flow.return_value = mock_flow + + args = mock.MagicMock(file_path=str(pkl_file), name=None, description=None) + with ( + mock.patch("pickle.load", return_value=mock_model), + mock.patch( + "openml.extensions.get_extension_by_model", + return_value=mock_ext, + ) as mock_get_ext, + ): + upload_flow(args) + mock_get_ext.assert_called_once_with(mock_model, raise_if_no_extension=True) + mock_flow.publish.assert_called_once() + + +def test_upload_flow_file_not_found(capsys): + # Verify a clear error is shown when the pickle file does not exist + args = mock.MagicMock(file_path="/nonexistent/model.pkl") + with pytest.raises(SystemExit, match="1"): + upload_flow(args) + assert "not found" in capsys.readouterr().out + + +def test_upload_run_calls_from_filesystem(tmp_path): + # Verify upload_run delegates to OpenMLRun.from_filesystem and publishes + run_dir = tmp_path / "run_output" + run_dir.mkdir() + + mock_run = mock.MagicMock(run_id=55, openml_url="https://openml.org/r/55") + args = mock.MagicMock(file_path=str(run_dir), no_model=False) + + with mock.patch.object( + openml.runs.OpenMLRun, "from_filesystem", return_value=mock_run, + ) as mock_fs: + upload_run(args) + mock_fs.assert_called_once_with(run_dir, expect_model=True) + mock_run.publish.assert_called_once() + + +def test_upload_run_dir_not_found(capsys): + # Verify a clear error is shown when the run directory does not exist + args = mock.MagicMock(file_path="/nonexistent/run_dir", no_model=False) + with pytest.raises(SystemExit, match="1"): + upload_run(args) + assert "not found" in capsys.readouterr().out + + +def test_upload_missing_api_key(capsys): + # Verify upload refuses to proceed without an API key + args = mock.MagicMock(upload_resource="dataset") + with ( + openml.config.overwrite_config_context({"apikey": ""}), + pytest.raises(SystemExit, match="1"), + ): + upload(args) + assert "No API key configured" in capsys.readouterr().out