From 410f0a6ca3e01477be31639c049304993a18c778 Mon Sep 17 00:00:00 2001
From: Manas-7854 <manas.agrawal@research.iiit.ac.in>
Date: Wed, 18 Mar 2026 18:30:42 +0530
Subject: [PATCH 1/2] Add upload subcommand for datasets, flows, and runs; fix ruff errors

---
 openml/cli.py                 | 197 +++++++++++++++++++++++++++++++++-
 tests/test_openml/test_cli.py |  82 ++++++++++++--
 2 files changed, 271 insertions(+), 8 deletions(-)

diff --git a/openml/cli.py b/openml/cli.py
index 838f774d1..3e6fe9264 100644
--- a/openml/cli.py
+++ b/openml/cli.py
@@ -3,6 +3,7 @@
 from __future__ import annotations
 
 import argparse
+import pickle
 import string
 import sys
 from collections.abc import Callable
@@ -301,6 +302,119 @@ def configure_field(  # noqa: PLR0913
     verbose_set(field, value)
 
 
+def upload_dataset(args: argparse.Namespace) -> None:
+    """Upload a dataset from a CSV or ARFF file to OpenML; exit with status 1 on a bad path."""
+    import pandas as pd
+
+    file_path = Path(args.file_path)
+    if not file_path.is_file():
+        print(f"Error: File '{file_path}' not found.")
+        sys.exit(1)
+
+    suffix = file_path.suffix.lower()  # extension match is case-insensitive
+    if suffix == ".csv":
+        data = pd.read_csv(file_path)
+    elif suffix == ".arff":
+        import arff
+
+        with file_path.open(encoding="utf-8") as fh:  # match read_csv's UTF-8 default
+            arff_data = arff.load(fh)
+        data = pd.DataFrame(
+            arff_data["data"],
+            columns=[attr[0] for attr in arff_data["attributes"]],
+        )
+    else:
+        print(f"Error: Unsupported file format '{suffix}'. Supported formats: .csv, .arff")
+        sys.exit(1)
+
+    dataset = openml.datasets.create_dataset(
+        name=args.name,
+        description=args.description,
+        creator=args.creator,
+        contributor=args.contributor,
+        collection_date=args.collection_date,
+        language=args.language,
+        licence=args.licence,
+        attributes="auto",
+        data=data,
+        default_target_attribute=args.default_target_attribute,
+        ignore_attribute=args.ignore_attribute,
+        citation=args.citation or "",
+        row_id_attribute=args.row_id_attribute,
+        original_data_url=args.original_data_url,
+        paper_url=args.paper_url,
+        version_label=args.version_label,
+        update_comment=args.update_comment,
+    )
+    dataset.publish()
+    print(f"Dataset successfully uploaded. ID: {dataset.id}")
+    print(f"URL: {dataset.openml_url}")
+
+
+def upload_flow(args: argparse.Namespace) -> None:
+    """Upload a flow from a serialized model file to OpenML."""
+    from openml_sklearn import SklearnExtension
+
+    file_path = Path(args.file_path)
+    if not file_path.is_file():
+        print(f"Error: File '{file_path}' not found.")
+        sys.exit(1)
+
+    with file_path.open("rb") as fh:
+        model = pickle.load(fh)  # noqa: S301
+
+    extension = SklearnExtension()
+    flow = extension.model_to_flow(model)
+
+    if args.name:
+        flow.custom_name = args.name
+    if args.description:
+        flow.description = args.description
+
+    flow.publish()
+    print(f"Flow successfully uploaded. ID: {flow.flow_id}")
+    print(f"URL: {flow.openml_url}")
+
+
+def upload_run(args: argparse.Namespace) -> None:
+    """Load a run from the directory ``args.file_path`` and publish it to OpenML."""
+    directory = Path(args.file_path)
+    if not directory.is_dir():
+        print(f"Error: Directory '{directory}' not found.")
+        sys.exit(1)
+
+    expect_model = not args.no_model  # --no_model turns the loader's expect_model flag off
+    run = openml.runs.OpenMLRun.from_filesystem(directory, expect_model=expect_model)
+    run.publish()
+    print(f"Run successfully uploaded. ID: {run.run_id}")
+    print(f"URL: {run.openml_url}")
+
+
+def upload(args: argparse.Namespace) -> None:
+    """Require an API key, then dispatch to the handler for ``args.upload_resource``."""
+    if not openml.config.apikey:  # checked once here, before any resource handler runs
+        print(
+            "Error: No API key configured. Set your API key with:\n"
+            "  openml configure apikey\n"
+            "For more information, see: "
+            "https://openml.github.io/openml-python/latest/examples/Basics/"
+            "introduction_tutorial/#authentication",
+        )
+        sys.exit(1)
+
+    upload_functions: dict[str, Callable[[argparse.Namespace], None]] = {
+        "dataset": upload_dataset,
+        "flow": upload_flow,
+        "run": upload_run,
+    }
+
+    if args.upload_resource not in upload_functions:  # e.g. no resource was given
+        print("Please specify a resource to upload: dataset, flow, or run.")
+        sys.exit(1)
+
+    upload_functions[args.upload_resource](args)
+
+
 def configure(args: argparse.Namespace) -> None:
     """Calls the right submenu(s) to edit `args.field` in the configuration file."""
     set_functions = {
@@ -330,7 +444,10 @@ def not_supported_yet(_: str) -> None:
 
 
 def main() -> None:
-    subroutines = {"configure": configure}
+    subroutines: dict[str, Callable[[argparse.Namespace], None]] = {
+        "configure": configure,
+        "upload": upload,
+    }
 
     parser = argparse.ArgumentParser()
     # Add a global --version flag to display installed version and exit
@@ -371,6 +488,84 @@ def main() -> None:
         help="The value to set the FIELD to.",
     )
 
+    # --- upload subcommand ---
+    parser_upload = subparsers.add_parser(
+        "upload",
+        description="Upload resources (datasets, flows, or runs) to OpenML.",
+    )
+    upload_subparsers = parser_upload.add_subparsers(dest="upload_resource")
+
+    # upload dataset
+    parser_upload_dataset = upload_subparsers.add_parser(
+        "dataset",
+        description="Upload a dataset from a CSV or ARFF file.",
+    )
+    parser_upload_dataset.add_argument(
+        "file_path",
+        type=str,
+        help="Path to the dataset file (.csv or .arff).",
+    )
+    _dataset_args: list[tuple[str, str, bool]] = [
+        ("--name", "Name of the dataset.", True),
+        ("--description", "Description of the dataset.", True),
+        ("--default_target_attribute", "The default target attribute.", True),
+        ("--creator", "The person who created the dataset.", False),
+        ("--contributor", "People who contributed to the dataset.", False),
+        ("--collection_date", "The date the data was originally collected.", False),
+        ("--language", "Language in which the data is represented.", False),
+        ("--licence", "License of the data.", False),
+        ("--ignore_attribute", "Attributes to exclude in modelling (comma separated).", False),
+        ("--citation", "Reference(s) that should be cited.", False),
+        ("--row_id_attribute", "The attribute that represents the row-id column.", False),
+        ("--original_data_url", "URL to the original dataset (for derived data).", False),
+        ("--paper_url", "Link to a paper describing the dataset.", False),
+        ("--version_label", "Version label (e.g. date, hash).", False),
+        ("--update_comment", "An explanation for when the dataset is uploaded.", False),
+    ]
+    for flag, help_text, required in _dataset_args:
+        parser_upload_dataset.add_argument(
+            flag,
+            type=str,
+            required=required,
+            default=None,
+            help=help_text,
+        )
+
+    # upload flow
+    parser_upload_flow = upload_subparsers.add_parser(
+        "flow",
+        description="Upload a flow from a serialized model file (.pkl).",
+    )
+    parser_upload_flow.add_argument(
+        "file_path",
+        type=str,
+        help="Path to the serialized model file (.pkl).",
+    )
+    parser_upload_flow.add_argument("--name", type=str, default=None, help="Custom flow name.")
+    parser_upload_flow.add_argument(
+        "--description",
+        type=str,
+        default=None,
+        help="Description of the flow.",
+    )
+
+    # upload run
+    parser_upload_run = upload_subparsers.add_parser(
+        "run",
+        description="Upload a run from a directory containing run files.",
+    )
+    parser_upload_run.add_argument(
+        "file_path",
+        type=str,
+        help="Path to directory with run files (description.xml, predictions.arff, etc.).",
+    )
+    parser_upload_run.add_argument(
+        "--no_model",
+        action="store_true",
+        default=False,
+        help="If set, do not require model.pkl in the run directory.",
+    )
+
     args = parser.parse_args()
     subroutines.get(args.subroutine, lambda _: parser.print_help())(args)
 
diff --git a/tests/test_openml/test_cli.py b/tests/test_openml/test_cli.py
index eb213b561..e6680ac0f 100644
--- a/tests/test_openml/test_cli.py
+++ b/tests/test_openml/test_cli.py
@@ -4,17 +4,19 @@
 import shutil
 import subprocess
 import sys
+from unittest import mock
 
-import openml
 import pytest
 
+import openml
+from openml.cli import main
+
 
 def test_cli_version_prints_package_version():
     # Invoke the CLI via module to avoid relying on console script installation
-    result = subprocess.run(
+    result = subprocess.run(  # noqa: S603
         [sys.executable, "-m", "openml.cli", "--version"],
-        stdout=subprocess.PIPE,
-        stderr=subprocess.PIPE,
+        capture_output=True,
         text=True,
         check=False,
     )
@@ -31,10 +33,9 @@ def test_console_script_version_prints_package_version():
     if console is None:
         pytest.skip("'openml' console script not found in PATH")
 
-    result = subprocess.run(
+    result = subprocess.run(  # noqa: S603
         [console, "--version"],
-        stdout=subprocess.PIPE,
-        stderr=subprocess.PIPE,
+        capture_output=True,
         text=True,
         check=False,
     )
@@ -42,3 +43,70 @@ def test_console_script_version_prints_package_version():
     assert result.returncode == 0
     assert result.stderr == ""
     assert openml.__version__ in result.stdout
+
+
+def test_upload_dataset_arg_parsing():
+    # Test that the dataset subcommand correctly parses required and optional arguments
+    test_args = [
+        "upload", "dataset", "data.csv",
+        "--name", "MyDataset",
+        "--description", "A test dataset",
+        "--default_target_attribute", "target",
+        "--creator", "TestUser",
+    ]
+    with (
+        mock.patch("sys.argv", ["openml", *test_args]),
+        mock.patch("openml.cli.upload") as mock_upload,
+    ):
+        main()
+        args = mock_upload.call_args[0][0]
+        assert args.subroutine == "upload"
+        assert args.upload_resource == "dataset"
+        assert args.file_path == "data.csv"
+        assert args.name == "MyDataset"
+        assert args.description == "A test dataset"
+        assert args.default_target_attribute == "target"
+        assert args.creator == "TestUser"
+        assert args.contributor is None
+        assert args.licence is None
+
+
+def test_upload_flow_arg_parsing():
+    # Test that the flow subcommand correctly parses positional and optional arguments
+    test_args = ["upload", "flow", "model.pkl", "--name", "MyFlow", "--description", "A flow"]
+    with (
+        mock.patch("sys.argv", ["openml", *test_args]),
+        mock.patch("openml.cli.upload") as mock_upload,
+    ):
+        main()
+        args = mock_upload.call_args[0][0]
+        assert args.upload_resource == "flow"
+        assert args.file_path == "model.pkl"
+        assert args.name == "MyFlow"
+        assert args.description == "A flow"
+
+
+def test_upload_run_arg_parsing():
+    # Test that the run subcommand correctly parses positional and flag arguments
+    test_args = ["upload", "run", "/path/to/run_dir", "--no_model"]
+    with (
+        mock.patch("sys.argv", ["openml", *test_args]),
+        mock.patch("openml.cli.upload") as mock_upload,
+    ):
+        main()
+        args = mock_upload.call_args[0][0]
+        assert args.upload_resource == "run"
+        assert args.file_path == "/path/to/run_dir"
+        assert args.no_model is True
+
+
+def test_upload_run_no_model_defaults_false():
+    # Test that the --no_model flag defaults to False if not provided
+    test_args = ["upload", "run", "/path/to/run_dir"]
+    with (
+        mock.patch("sys.argv", ["openml", *test_args]),
+        mock.patch("openml.cli.upload") as mock_upload,
+    ):
+        main()
+        args = mock_upload.call_args[0][0]
+        assert args.no_model is False

From 76bca3fff73a8f574f62a1ce5594f66c9b01cfae Mon Sep 17 00:00:00 2001
From: Manas-7854 <manas.agrawal@research.iiit.ac.in>
Date: Wed, 18 Mar 2026 19:38:25 +0530
Subject: [PATCH 2/2] Add upload tests; look up flow extension by model and warn about pickle

---
 openml/cli.py                 |  17 ++++--
 tests/test_openml/test_cli.py | 112 +++++++++++++++++++++++++++++++++-
 2 files changed, 122 insertions(+), 7 deletions(-)

diff --git a/openml/cli.py b/openml/cli.py
index 3e6fe9264..bd5f9ed2c 100644
--- a/openml/cli.py
+++ b/openml/cli.py
@@ -3,7 +3,6 @@
 from __future__ import annotations
 
 import argparse
-import pickle
 import string
 import sys
 from collections.abc import Callable
@@ -353,17 +352,22 @@ def upload_dataset(args: argparse.Namespace) -> None:
 
 def upload_flow(args: argparse.Namespace) -> None:
     """Upload a flow from a serialized model file to OpenML."""
-    from openml_sklearn import SklearnExtension
+    import pickle
 
     file_path = Path(args.file_path)
     if not file_path.is_file():
         print(f"Error: File '{file_path}' not found.")
         sys.exit(1)
 
+    print(
+        "WARNING: Loading pickle files executes arbitrary code. "
+        "Only use this with files you trust.",
+    )
     with file_path.open("rb") as fh:
         model = pickle.load(fh)  # noqa: S301
 
-    extension = SklearnExtension()
+    extension = openml.extensions.get_extension_by_model(model, raise_if_no_extension=True)
+    assert extension is not None  # guaranteed by raise_if_no_extension=True
     flow = extension.model_to_flow(model)
 
     if args.name:
@@ -508,7 +512,7 @@ def main() -> None:
     _dataset_args: list[tuple[str, str, bool]] = [
         ("--name", "Name of the dataset.", True),
         ("--description", "Description of the dataset.", True),
-        ("--default_target_attribute", "The default target attribute.", True),
+        ("--default_target_attribute", "The default target attribute.", False),
         ("--creator", "The person who created the dataset.", False),
         ("--contributor", "People who contributed to the dataset.", False),
         ("--collection_date", "The date the data was originally collected.", False),
@@ -534,12 +538,13 @@ def main() -> None:
     # upload flow
     parser_upload_flow = upload_subparsers.add_parser(
         "flow",
-        description="Upload a flow from a serialized model file (.pkl).",
+        description="Upload a flow from a serialized model file (.pkl). "
+        "WARNING: pickle files can execute arbitrary code. Only use trusted files.",
     )
     parser_upload_flow.add_argument(
         "file_path",
         type=str,
-        help="Path to the serialized model file (.pkl).",
+        help="Path to the serialized model file (.pkl). WARNING: only use trusted pickle files.",
     )
     parser_upload_flow.add_argument("--name", type=str, default=None, help="Custom flow name.")
     parser_upload_flow.add_argument(
diff --git a/tests/test_openml/test_cli.py b/tests/test_openml/test_cli.py
index e6680ac0f..ce2c85752 100644
--- a/tests/test_openml/test_cli.py
+++ b/tests/test_openml/test_cli.py
@@ -9,7 +9,7 @@
 import pytest
 
 import openml
-from openml.cli import main
+from openml.cli import main, upload, upload_dataset, upload_flow, upload_run
 
 
 def test_cli_version_prints_package_version():
@@ -110,3 +110,113 @@ def test_upload_run_no_model_defaults_false():
         main()
         args = mock_upload.call_args[0][0]
         assert args.no_model is False
+
+
+def test_upload_dataset_csv(tmp_path):
+    # Verify CSV upload calls create_dataset and publish with correct arguments
+    csv_file = tmp_path / "test.csv"
+    csv_file.write_text("col_a,col_b,target\n1,2.0,cat\n3,4.0,dog\n")
+
+    args = mock.MagicMock()
+    args.file_path = str(csv_file)
+    args.name = "TestDS"
+    args.description = "desc"
+    args.default_target_attribute = "target"
+    for attr in (
+        "creator", "contributor", "collection_date", "language", "licence",
+        "ignore_attribute", "citation", "row_id_attribute",
+        "original_data_url", "paper_url", "version_label", "update_comment",
+    ):
+        setattr(args, attr, None)
+
+    mock_ds = mock.MagicMock(id=42, openml_url="https://openml.org/d/42")
+    with mock.patch("openml.datasets.create_dataset", return_value=mock_ds) as mock_create:
+        upload_dataset(args)
+        mock_create.assert_called_once()
+        assert mock_create.call_args[1]["name"] == "TestDS"
+        assert mock_create.call_args[1]["attributes"] == "auto"
+        mock_ds.publish.assert_called_once()
+
+
+def test_upload_dataset_file_not_found(capsys):
+    # Verify a clear error is shown when the file does not exist
+    args = mock.MagicMock(file_path="/nonexistent/data.csv")
+    with pytest.raises(SystemExit, match="1"):
+        upload_dataset(args)
+    assert "not found" in capsys.readouterr().out
+
+
+def test_upload_dataset_unsupported_format(tmp_path, capsys):
+    # Verify unsupported file extensions are rejected
+    bad_file = tmp_path / "data.json"
+    bad_file.write_text("{}")
+    args = mock.MagicMock(file_path=str(bad_file))
+    with pytest.raises(SystemExit, match="1"):
+        upload_dataset(args)
+    assert "Unsupported file format" in capsys.readouterr().out
+
+
+def test_upload_flow_uses_extension_api(tmp_path):
+    # Verify upload_flow uses get_extension_by_model instead of direct openml_sklearn import
+    pkl_file = tmp_path / "model.pkl"
+    pkl_file.write_bytes(b"fake")
+
+    mock_model = mock.MagicMock()
+    mock_flow = mock.MagicMock(flow_id=99, openml_url="https://openml.org/f/99")
+    mock_ext = mock.MagicMock(**{"model_to_flow.return_value": mock_flow})
+
+    args = mock.MagicMock(file_path=str(pkl_file), description=None)
+    args.name = None  # Mock(name=...) names the mock itself; set the attribute afterwards
+    with (
+        mock.patch("pickle.load", return_value=mock_model),
+        mock.patch(
+            "openml.extensions.get_extension_by_model",
+            return_value=mock_ext,
+        ) as mock_get_ext,
+    ):
+        upload_flow(args)
+        mock_get_ext.assert_called_once_with(mock_model, raise_if_no_extension=True)
+        mock_flow.publish.assert_called_once()
+
+
+def test_upload_flow_file_not_found(capsys):
+    # Verify a clear error is shown when the pickle file does not exist
+    args = mock.MagicMock(file_path="/nonexistent/model.pkl")
+    with pytest.raises(SystemExit, match="1"):
+        upload_flow(args)
+    assert "not found" in capsys.readouterr().out
+
+
+def test_upload_run_calls_from_filesystem(tmp_path):
+    # Verify upload_run delegates to OpenMLRun.from_filesystem and publishes
+    run_dir = tmp_path / "run_output"
+    run_dir.mkdir()
+
+    mock_run = mock.MagicMock(run_id=55, openml_url="https://openml.org/r/55")
+    args = mock.MagicMock(file_path=str(run_dir), no_model=False)
+
+    with mock.patch.object(
+        openml.runs.OpenMLRun, "from_filesystem", return_value=mock_run,
+    ) as mock_fs:
+        upload_run(args)
+        mock_fs.assert_called_once_with(run_dir, expect_model=True)
+        mock_run.publish.assert_called_once()
+
+
+def test_upload_run_dir_not_found(capsys):
+    # Verify a clear error is shown when the run directory does not exist
+    args = mock.MagicMock(file_path="/nonexistent/run_dir", no_model=False)
+    with pytest.raises(SystemExit, match="1"):
+        upload_run(args)
+    assert "not found" in capsys.readouterr().out
+
+
+def test_upload_missing_api_key(capsys):
+    # Verify upload refuses to proceed without an API key
+    args = mock.MagicMock(upload_resource="dataset")
+    with (
+        openml.config.overwrite_config_context({"apikey": ""}),
+        pytest.raises(SystemExit, match="1"),
+    ):
+        upload(args)
+    assert "No API key configured" in capsys.readouterr().out