From 3716af368b9f4761bdb5f7d8792c736005572196 Mon Sep 17 00:00:00 2001 From: rachit23tech Date: Wed, 18 Mar 2026 19:55:26 +0530 Subject: [PATCH 1/2] Fix: Error Handling & Validation Improvements --- openml/datasets/data_feature.py | 97 +++++++++++++++++++++++++++------ openml/datasets/dataset.py | 10 +++- 2 files changed, 89 insertions(+), 18 deletions(-) diff --git a/openml/datasets/data_feature.py b/openml/datasets/data_feature.py index 0598763b0..f39824961 100644 --- a/openml/datasets/data_feature.py +++ b/openml/datasets/data_feature.py @@ -15,19 +15,28 @@ class OpenMLDataFeature: # noqa: PLW1641 Parameters ---------- index : int - The index of this feature + The index of this feature. Must be non-negative. name : str - Name of the feature + Name of the feature. Must be a non-empty string. data_type : str can be nominal, numeric, string, date (corresponds to arff) nominal_values : list(str) - list of the possible values, in case of nominal attribute + list of the possible values, in case of nominal attribute. + Must be a non-empty list for nominal data types. number_missing_values : int Number of rows that have a missing value for this feature. - ontologies : list(str) + Must be non-negative. + ontologies : list(str), optional list of ontologies attached to this feature. An ontology describes the concept that are described in a feature. An ontology is defined by an URL where the information is provided. + + Raises + ------ + TypeError + If types are incorrect for any parameter. + ValueError + If values are invalid (e.g., negative counts, empty name, invalid data_type). """ LEGAL_DATA_TYPES: ClassVar[Sequence[str]] = ["nominal", "numeric", "string", "date"] @@ -41,38 +50,94 @@ def __init__( # noqa: PLR0913 number_missing_values: int, ontologies: list[str] | None = None, ): + # Validate index if not isinstance(index, int): - raise TypeError(f"Index must be `int` but is {type(index)}") + raise TypeError( + f"Parameter 'index' must be int, but got {type(index).__name__}. Value: {index!r}" + ) + if index < 0: + raise ValueError( + f"Parameter 'index' must be non-negative, but got {index}. " + "Feature indices cannot be negative." + ) + # Validate name + if not isinstance(name, str): + raise TypeError( + f"Parameter 'name' must be str, but got {type(name).__name__}. Value: {name!r}" + ) + if not name.strip(): + raise ValueError(f"Parameter 'name' cannot be empty or whitespace-only. Got: {name!r}") + + # Validate data_type + if not isinstance(data_type, str): + raise TypeError( + f"Parameter 'data_type' must be str, but got {type(data_type).__name__}. " + f"Value: {data_type!r}" + ) if data_type not in self.LEGAL_DATA_TYPES: raise ValueError( - f"data type should be in {self.LEGAL_DATA_TYPES!s}, found: {data_type}", + f"Parameter 'data_type' must be one of {list(self.LEGAL_DATA_TYPES)}, " + f"but got {data_type!r}." ) + # Validate nominal_values if data_type == "nominal": if nominal_values is None: raise TypeError( - "Dataset features require attribute `nominal_values` for nominal feature type.", + "Parameter 'nominal_values' is required for nominal data types, but got None. " + "Please provide a list of nominal values." ) - if not isinstance(nominal_values, list): raise TypeError( - "Argument `nominal_values` is of wrong datatype, should be list, " - f"but is {type(nominal_values)}", + f"Parameter 'nominal_values' must be list, but got {type(nominal_values).__name__}. " + f"Value: {nominal_values!r}" + ) + if not nominal_values: + raise ValueError( + "Parameter 'nominal_values' cannot be empty for nominal data types. " + "Please provide at least one nominal value." + ) + # Validate that all elements are strings + non_string_values = [v for v in nominal_values if not isinstance(v, str)] + if non_string_values: + raise TypeError( + f"All elements in 'nominal_values' must be str, but found non-string values: " + f"{non_string_values}. Expected all strings in list." ) elif nominal_values is not None: - raise TypeError("Argument `nominal_values` must be None for non-nominal feature.") + raise TypeError( + f"Parameter 'nominal_values' must be None for non-nominal data types (got {data_type!r}), " + f"but got {type(nominal_values).__name__}. Value: {nominal_values!r}" + ) + # Validate number_missing_values if not isinstance(number_missing_values, int): - msg = f"number_missing_values must be int but is {type(number_missing_values)}" - raise TypeError(msg) + raise TypeError( + f"Parameter 'number_missing_values' must be int, but got " + f"{type(number_missing_values).__name__}. Value: {number_missing_values!r}" + ) + if number_missing_values < 0: + raise ValueError( + f"Parameter 'number_missing_values' must be non-negative, but got " + f"{number_missing_values}. Cannot have negative missing values." + ) + + # Validate ontologies (keep simple - just check if list or None) + if ontologies is not None: + if not isinstance(ontologies, list): + raise TypeError( + f"Parameter 'ontologies' must be list or None, but got {type(ontologies).__name__}. " + f"Value: {ontologies!r}" + ) + # All validations passed, assign attributes self.index = index - self.name = str(name) - self.data_type = str(data_type) + self.name = name + self.data_type = data_type self.nominal_values = nominal_values self.number_missing_values = number_missing_values - self.ontologies = ontologies + self.ontologies = ontologies if ontologies is not None else [] def __repr__(self) -> str: return f"[{self.index} - {self.name} ({self.data_type})]" diff --git a/openml/datasets/dataset.py b/openml/datasets/dataset.py index 59d6205ba..e0be4bab3 100644 --- a/openml/datasets/dataset.py +++ b/openml/datasets/dataset.py @@ -1022,14 +1022,20 @@ def _parse_features_xml(features_xml_string: str) -> dict[int, OpenMLDataFeature features: dict[int, OpenMLDataFeature] = {} for idx, xmlfeature in enumerate(features_xml["oml:feature"]): - nr_missing = xmlfeature.get("oml:number_of_missing_values", 0) + nr_missing = xmlfeature.get("oml:number_of_missing_values", "0") + + # FIX: Convert ontology string to list for consistency + ontologies = xmlfeature.get("oml:ontology") + if isinstance(ontologies, str): + ontologies = [ontologies] + feature = OpenMLDataFeature( int(xmlfeature["oml:index"]), xmlfeature["oml:name"], xmlfeature["oml:data_type"], xmlfeature.get("oml:nominal_value"), int(nr_missing), - xmlfeature.get("oml:ontology"), + ontologies, # Now it's always list or None ) if idx != feature.index: raise ValueError("Data features not provided in right order") From 7d8db2389f3208c88fd2cdc1150cb42dd8574556 Mon Sep 17 00:00:00 2001 From: rachit23tech Date: Wed, 18 Mar 2026 20:59:34 +0530 Subject: [PATCH 2/2] Refactor: reduce __init__ complexity and fix ruff issues --- openml/datasets/data_feature.py | 43 +++++++++++++++++---------------- 1 file changed, 22 insertions(+), 21 deletions(-) diff --git a/openml/datasets/data_feature.py b/openml/datasets/data_feature.py index f39824961..dd60b9393 100644 --- a/openml/datasets/data_feature.py +++ b/openml/datasets/data_feature.py @@ -41,7 +41,7 @@ class OpenMLDataFeature: # noqa: PLW1641 LEGAL_DATA_TYPES: ClassVar[Sequence[str]] = ["nominal", "numeric", "string", "date"] - def __init__( # noqa: PLR0913 + def __init__( # noqa: PLR0913, C901, PLR0912 self, index: int, name: str, @@ -72,8 +72,8 @@ def __init__( # noqa: PLR0913 # Validate data_type if not isinstance(data_type, str): raise TypeError( - f"Parameter 'data_type' must be str, but got {type(data_type).__name__}. " - f"Value: {data_type!r}" + f"Parameter 'data_type' must be str, but got " + f"{type(data_type).__name__}. Value: {data_type!r}" ) if data_type not in self.LEGAL_DATA_TYPES: raise ValueError( @@ -85,37 +85,39 @@ def __init__( # noqa: PLR0913 if data_type == "nominal": if nominal_values is None: raise TypeError( - "Parameter 'nominal_values' is required for nominal data types, but got None. " - "Please provide a list of nominal values." + "Parameter 'nominal_values' is required for nominal data types, " + "but got None. Please provide a list of nominal values." ) if not isinstance(nominal_values, list): raise TypeError( - f"Parameter 'nominal_values' must be list, but got {type(nominal_values).__name__}. " - f"Value: {nominal_values!r}" + f"Parameter 'nominal_values' must be list, but got " + f"{type(nominal_values).__name__}. Value: {nominal_values!r}" ) if not nominal_values: raise ValueError( - "Parameter 'nominal_values' cannot be empty for nominal data types. " - "Please provide at least one nominal value." + "Parameter 'nominal_values' cannot be empty for nominal data " + "types. Please provide at least one nominal value." ) # Validate that all elements are strings non_string_values = [v for v in nominal_values if not isinstance(v, str)] if non_string_values: raise TypeError( - f"All elements in 'nominal_values' must be str, but found non-string values: " - f"{non_string_values}. Expected all strings in list." + f"All elements in 'nominal_values' must be str, but found " + f"non-string values: {non_string_values}." ) elif nominal_values is not None: raise TypeError( - f"Parameter 'nominal_values' must be None for non-nominal data types (got {data_type!r}), " - f"but got {type(nominal_values).__name__}. Value: {nominal_values!r}" + f"Parameter 'nominal_values' must be None for non-nominal data " + f"types (got {data_type!r}), but got {type(nominal_values).__name__}. " + f"Value: {nominal_values!r}" ) # Validate number_missing_values if not isinstance(number_missing_values, int): raise TypeError( f"Parameter 'number_missing_values' must be int, but got " - f"{type(number_missing_values).__name__}. Value: {number_missing_values!r}" + f"{type(number_missing_values).__name__}. " + f"Value: {number_missing_values!r}" ) if number_missing_values < 0: raise ValueError( @@ -123,13 +125,12 @@ def __init__( # noqa: PLR0913 f"{number_missing_values}. Cannot have negative missing values." ) - # Validate ontologies (keep simple - just check if list or None) - if ontologies is not None: - if not isinstance(ontologies, list): - raise TypeError( - f"Parameter 'ontologies' must be list or None, but got {type(ontologies).__name__}. " - f"Value: {ontologies!r}" - ) + # Validate ontologies + if ontologies is not None and not isinstance(ontologies, list): + raise TypeError( + f"Parameter 'ontologies' must be list or None, but got " + f"{type(ontologies).__name__}. Value: {ontologies!r}" + ) # All validations passed, assign attributes self.index = index