From a89af33df1376cf655a89e072c59f5bad8abedb8 Mon Sep 17 00:00:00 2001 From: Nick Date: Thu, 12 Mar 2026 16:10:58 +0100 Subject: [PATCH 1/6] Added ranged bar chart for machine learning example function --- .../visualize_training_data.py | 63 +++++++++++++++++-- 1 file changed, 59 insertions(+), 4 deletions(-) diff --git a/public/machine_learning/visualize_training_data.py b/public/machine_learning/visualize_training_data.py index 4709d73..c2c23db 100644 --- a/public/machine_learning/visualize_training_data.py +++ b/public/machine_learning/visualize_training_data.py @@ -7,6 +7,7 @@ from simstack.models.charts_artifact import ( ChartArtifactModel, AGBarSeriesConfig, + AGRangeBarSeriesConfig, AGChartAxisConfig, AGChartTitleConfig ) @@ -92,6 +93,55 @@ async def visualize_strain_vs_concentration(dataset: PandasModel, **kwargs): return await _visualize_strain_vs_concentration_internal(dataset, **kwargs) +async def _visualize_impurity_ranges_internal(dataset: PandasModel, **kwargs): + node_runner = kwargs.get("node_runner") + task_id = kwargs.get("task_id") + + df = dataset.table + impurity_cols = ["C_wt_percent", "Mn_wt_percent", "P_wt_percent", "S_wt_percent"] + + min_values = df[impurity_cols].min() + max_values = df[impurity_cols].max() + + chart_data = [] + for col in impurity_cols: + chart_data.append({ + "impurity": col.split("_")[0], + "min_value": float(min_values[col]), + "max_value": float(max_values[col]) + }) + + range_series = [ + AGRangeBarSeriesConfig( + type="range-bar", + xKey="impurity", + yLowKey="min_value", + yHighKey="max_value", + title="Impurity Concentration Range", + data=chart_data + ) + ] + + axes = [ + AGChartAxisConfig(type="category", position="bottom", title="Impurity"), + AGChartAxisConfig(type="number", position="left", title="Concentration (wt%)") + ] + + range_chart = ChartArtifactModel( + title=AGChartTitleConfig(text="Impurity Concentration Min/Max Ranges"), + series=range_series, + axes=axes, + data=chart_data + ) + + if task_id: + range_chart.parent_id = ObjectId(task_id) + await context.db.save(range_chart) + node_runner.info("Saved impurity min/max range chart") + + return range_chart + + async def _visualize_impurity_maxima_internal(dataset: PandasModel, **kwargs): node_runner = kwargs.get("node_runner") task_id = kwargs.get("task_id") @@ -154,9 +204,14 @@ async def _visualize_impurity_maxima_internal(dataset: PandasModel, **kwargs): @node(parameters=Parameters(force_rerun=True)) async def visualize_impurity_maxima(dataset: PandasModel, **kwargs): - chart = await _visualize_impurity_maxima_internal(dataset, **kwargs) - kwargs.get("node_runner").chart = chart - return kwargs.get("node_runner").succeed() + max_chart = await _visualize_impurity_maxima_internal(dataset, **kwargs) + + # We can remove old max_chart and set this new range_chart as node_runner.chart if needed + range_chart = await _visualize_impurity_ranges_internal(dataset, **kwargs) + + node_runner = kwargs.get("node_runner") + node_runner.chart = max_chart + return node_runner.succeed() async def main(): @@ -190,4 +245,4 @@ def fail(self, msg): print(f"FAIL: {msg}") await _visualize_impurity_maxima_internal(dataset, **kwargs) if __name__ == "__main__": - asyncio.run(main()) \ No newline at end of file + asyncio.run(main()) From 9e68f7604512b7ff3e891702628649e00b555904 Mon Sep 17 00:00:00 2001 From: Nick Date: Fri, 13 Mar 2026 11:33:22 +0100 Subject: [PATCH 2/6] Added heatmaps. #143 --- .../visualize_training_data.py | 117 +++++++++++++++++- 1 file changed, 116 insertions(+), 1 deletion(-) diff --git a/public/machine_learning/visualize_training_data.py b/public/machine_learning/visualize_training_data.py index c2c23db..8cbe526 100644 --- a/public/machine_learning/visualize_training_data.py +++ b/public/machine_learning/visualize_training_data.py @@ -7,9 +7,11 @@ from simstack.models.charts_artifact import ( ChartArtifactModel, AGBarSeriesConfig, + AGHeatmapSeriesConfig, AGRangeBarSeriesConfig, AGChartAxisConfig, - AGChartTitleConfig + AGChartTitleConfig, + create_simple_heatmap_chart, ) from simstack.models.table_artifact import TableArtifactModel, AGGridColumnDef from simstack.models.pandas_model import PandasModel @@ -17,6 +19,47 @@ from simstack.models.charts_artifact import create_simple_scatter_chart +async def _save_correlation_heatmap( + corr_frame, + title: str, + task_id, + node_runner, + charts: list, + x_name: str = "Column", + y_name: str = "Row", +): + heatmap_data = [] + for row_name in corr_frame.index: + for col_name in corr_frame.columns: + heatmap_data.append( + { + "x_feature": str(col_name), + "y_feature": str(row_name), + "correlation": float(corr_frame.loc[row_name, col_name]), + } + ) + + heatmap_chart = create_simple_heatmap_chart( + data=heatmap_data, + x_key="x_feature", + y_key="y_feature", + color_key="correlation", + title=title, + parent_id=ObjectId(task_id) if task_id else None, + ) + + if heatmap_chart.series and isinstance(heatmap_chart.series[0], AGHeatmapSeriesConfig): + heatmap_chart.series[0].xName = x_name + heatmap_chart.series[0].yName = y_name + heatmap_chart.series[0].colorName = "Pearson correlation" + heatmap_chart.series[0].colorDomain = [-1.0, 1.0] + heatmap_chart.series[0].colorRange = ["#2166ac", "#f7f7f7", "#b2182b"] + + await context.db.save(heatmap_chart) + charts.append(heatmap_chart) + node_runner.info(f"Saved heatmap: {title}") + + async def _visualize_strain_vs_concentration_internal(dataset: PandasModel, **kwargs): node_runner = kwargs.get("node_runner") task_id = kwargs.get("task_id") @@ -83,6 +126,77 @@ async def _visualize_strain_vs_concentration_internal(dataset: PandasModel, **kw charts.append(corr_table) node_runner.info("Saved correlation matrix table") + await _save_correlation_heatmap( + corr, + title="Impurity / Strain Correlation Heatmap", + task_id=task_id, + node_runner=node_runner, + charts=charts, + x_name="Variable (X)", + y_name="Variable (Y)", + ) + + model_feature_cols = [ + col + for col in [ + "youngs_modulus_MPa", + "yield_strength_MPa", + "ultimate_strength_MPa", + "fracture_stress_MPa", + "fracture_strain", + "uniform_strain", + ] + if col in df.columns + ] + + if len(model_feature_cols) >= 2: + feature_corr = df[model_feature_cols].corr() + await _save_correlation_heatmap( + feature_corr, + title="ML Input Feature Correlation Heatmap", + task_id=task_id, + node_runner=node_runner, + charts=charts, + x_name="Model input feature (X)", + y_name="Model input feature (Y)", + ) + + if model_feature_cols: + feature_target_corr = df[model_feature_cols + impurity_cols].corr().loc[model_feature_cols, impurity_cols] + await _save_correlation_heatmap( + feature_target_corr, + title="ML Feature-to-Target Correlation Heatmap", + task_id=task_id, + node_runner=node_runner, + charts=charts, + x_name="Impurity target", + y_name="Model input feature", + ) + + engineered_feature_data = {} + if all(col in df.columns for col in ("ultimate_strength_MPa", "yield_strength_MPa")): + engineered_feature_data["strength_ratio"] = df["ultimate_strength_MPa"] / df["yield_strength_MPa"] + if all(col in df.columns for col in ("fracture_strain", "ultimate_strength_MPa", "youngs_modulus_MPa")): + engineered_feature_data["strain_margin"] = ( + df["fracture_strain"] - (df["ultimate_strength_MPa"] / df["youngs_modulus_MPa"]) + ) + + if engineered_feature_data: + engineered_df = df[impurity_cols].copy() + for col_name, series in engineered_feature_data.items(): + engineered_df[col_name] = series + engineered_cols = list(engineered_feature_data.keys()) + engineered_corr = engineered_df[engineered_cols + impurity_cols].corr().loc[engineered_cols, impurity_cols] + await _save_correlation_heatmap( + engineered_corr, + title="Engineered Feature-to-Target Correlation Heatmap", + task_id=task_id, + node_runner=node_runner, + charts=charts, + x_name="Impurity target", + y_name="Engineered feature", + ) + if hasattr(node_runner, 'result'): node_runner.result = {"charts_count": len(charts), "correlation_matrix": corr.to_dict()} return charts @@ -243,6 +357,7 @@ def fail(self, msg): print(f"FAIL: {msg}") print("\n--- Visualizing Impurity Maxima ---") await _visualize_impurity_maxima_internal(dataset, **kwargs) + await _visualize_impurity_ranges_internal(dataset, **kwargs) if __name__ == "__main__": asyncio.run(main()) From 543c51573af0707918622163cf751e035baffe7b Mon Sep 17 00:00:00 2001 From: Nick Date: Fri, 13 Mar 2026 12:20:08 +0100 Subject: [PATCH 3/6] Add ML heatmaps for onboarding demo --- public/machine_learning/ml_training_helper.py | 200 +++++++++++++++++- .../visualize_training_data.py | 175 +++++++++++++-- 2 files changed, 360 insertions(+), 15 deletions(-) diff --git a/public/machine_learning/ml_training_helper.py b/public/machine_learning/ml_training_helper.py index 2ce23bb..a292d89 100644 --- a/public/machine_learning/ml_training_helper.py +++ b/public/machine_learning/ml_training_helper.py @@ -3,10 +3,11 @@ from typing import Any import numpy as np +import pandas as pd from odmantic import ObjectId from simstack.core.context import context from simstack.models import BooleanData -from simstack.models.charts_artifact import create_simple_scatter_chart +from simstack.models.charts_artifact import AGHeatmapSeriesConfig, create_simple_heatmap_chart, create_simple_scatter_chart from simstack.models.pandas_model import PandasModel from sklearn.metrics import mean_squared_error, r2_score from sklearn.model_selection import train_test_split @@ -50,6 +51,112 @@ async def save_scatter_plot(y_true, y_pred, title, x_label, y_label, task_id, ** await context.db.save(chart) return chart + +async def save_heatmap_plot( + data, + *, + x_key, + y_key, + color_key, + title, + task_id, + x_name, + y_name, + color_name, + color_domain=None, + color_range=None, +): + chart = create_simple_heatmap_chart( + data=data, + x_key=x_key, + y_key=y_key, + color_key=color_key, + title=title, + parent_id=ObjectId(task_id) if task_id else None, + ) + if chart.series and isinstance(chart.series[0], AGHeatmapSeriesConfig): + chart.series[0].xName = x_name + chart.series[0].yName = y_name + chart.series[0].colorName = color_name + if color_domain is not None: + chart.series[0].colorDomain = color_domain + if color_range is not None: + chart.series[0].colorRange = color_range + await context.db.save(chart) + return chart + + +def _format_interval_label(interval) -> str: + return f"{interval.left:.3g} to {interval.right:.3g}" + + +def _build_prediction_density_heatmap_data(y_true, y_pred, *, bins: int = 12): + actual = np.asarray(y_true, dtype=float).reshape(-1) + predicted = np.asarray(y_pred, dtype=float).reshape(-1) + combined = np.concatenate([actual, predicted]) + if combined.size == 0 or np.nanmin(combined) == np.nanmax(combined): + return [] + + bin_count = min(bins, max(2, np.unique(combined).size)) + bin_edges = np.linspace(np.nanmin(combined), np.nanmax(combined), bin_count + 1) + frame = pd.DataFrame( + { + "actual_bin": pd.cut(actual, bins=bin_edges, include_lowest=True, duplicates="drop"), + "predicted_bin": pd.cut(predicted, bins=bin_edges, include_lowest=True, duplicates="drop"), + } + ).dropna() + if frame.empty: + return [] + + grouped = frame.groupby(["actual_bin", "predicted_bin"], observed=True).size().reset_index(name="sample_count") + return [ + { + "actual_bin": _format_interval_label(row["actual_bin"]), + "predicted_bin": _format_interval_label(row["predicted_bin"]), + "sample_count": int(row["sample_count"]), + } + for _, row in grouped.iterrows() + ] + + +def _build_property_error_heatmap_data(x_values, y_values, error_values, *, bins: int = 10): + frame = pd.DataFrame( + { + "x_value": np.asarray(x_values, dtype=float).reshape(-1), + "y_value": np.asarray(y_values, dtype=float).reshape(-1), + "abs_error": np.asarray(error_values, dtype=float).reshape(-1), + } + ).dropna() + if frame.empty: + return [] + + x_bin_count = min(bins, int(frame["x_value"].nunique())) + y_bin_count = min(bins, int(frame["y_value"].nunique())) + if x_bin_count < 2 or y_bin_count < 2: + return [] + + frame["x_bin"] = pd.cut(frame["x_value"], bins=x_bin_count, duplicates="drop") + frame["y_bin"] = pd.cut(frame["y_value"], bins=y_bin_count, duplicates="drop") + grouped = ( + frame.dropna(subset=["x_bin", "y_bin"]) + .groupby(["x_bin", "y_bin"], observed=True)["abs_error"] + .mean() + .reset_index(name="mean_abs_error") + ) + return [ + { + "x_bin": _format_interval_label(row["x_bin"]), + "y_bin": _format_interval_label(row["y_bin"]), + "mean_abs_error": float(row["mean_abs_error"]), + } + for _, row in grouped.iterrows() + ] + + +def _target_label(target: str) -> str: + return target.replace("_wt_percent", "").replace("_", " ") + + default_element_selector = ElementSelector(use_C=True, use_Mn=True, use_P=True, use_S=True) class RegressionAnalysis: @@ -102,7 +209,7 @@ async def make_model_data(self): # 2. Define Features and Targets # Drop rows with NaN if any (though synthetic data should be clean) - df_clean = df[self.features + self.targets].dropna() + df_clean = df[self.features + self.targets].dropna().copy() X_cols = self.features.copy() if self.use_engineered_features: @@ -157,19 +264,33 @@ async def model_analysis(self, model: Any) -> RegressionResult: y_pred_test = y_pred_test_raw metrics = {} + performance_rows = [] for i, target in enumerate(self.targets): # Metrics y_true_test = self.y_test.iloc[:, i] if len(self.targets) > 1 else self.y_test + y_true_train = self.y_train.iloc[:, i] if len(self.targets) > 1 else self.y_train + train_mse = mean_squared_error(y_true_train, y_pred_train[:, i]) + train_r2 = r2_score(y_true_train, y_pred_train[:, i]) mse = mean_squared_error(y_true_test, y_pred_test[:, i]) r2 = r2_score(y_true_test, y_pred_test[:, i]) + train_rmse = float(np.sqrt(train_mse)) + test_rmse = float(np.sqrt(mse)) metrics[f"{target}_mse"] = float(mse) metrics[f"{target}_r2"] = float(r2) + metrics[f"{target}_train_r2"] = float(train_r2) + metrics[f"{target}_train_rmse"] = train_rmse + metrics[f"{target}_test_rmse"] = test_rmse self.node_runner.info(f"Target {target}: MSE={mse:.6f}, R2={r2:.4f}") + performance_rows.extend( + [ + {"target": _target_label(target), "split": "Train", "r2": float(train_r2), "rmse": train_rmse}, + {"target": _target_label(target), "split": "Test", "r2": float(r2), "rmse": test_rmse}, + ] + ) # Scatter Plots using ChartArtifactModel # Training data - y_true_train = self.y_train.iloc[:, i] if len(self.targets) > 1 else self.y_train await save_scatter_plot( y_true_train, y_pred_train[:, i], title=f"Train: {target}", @@ -185,11 +306,84 @@ async def model_analysis(self, model: Any) -> RegressionResult: task_id=self.task_id ) + density_heatmap_data = _build_prediction_density_heatmap_data(y_true_test, y_pred_test[:, i]) + if density_heatmap_data: + await save_heatmap_plot( + density_heatmap_data, + x_key="actual_bin", + y_key="predicted_bin", + color_key="sample_count", + title=f"Actual vs Predicted Density Heatmap: {_target_label(target)}", + task_id=self.task_id, + x_name="Actual value bin", + y_name="Predicted value bin", + color_name="Sample count", + color_range=["#f7fbff", "#6baed6", "#08306b"], + ) + + if "yield_strength_MPa" in self.X_test.columns and "fracture_strain" in self.X_test.columns: + error_heatmap_data = _build_property_error_heatmap_data( + self.X_test["yield_strength_MPa"], + self.X_test["fracture_strain"], + np.abs(np.asarray(y_true_test, dtype=float).reshape(-1) - y_pred_test[:, i]), + ) + if error_heatmap_data: + await save_heatmap_plot( + error_heatmap_data, + x_key="x_bin", + y_key="y_bin", + color_key="mean_abs_error", + title=f"Prediction Error in Property Space: {_target_label(target)}", + task_id=self.task_id, + x_name="Yield strength bin (MPa)", + y_name="Fracture strain bin", + color_name="Mean absolute error", + color_range=["#fff5eb", "#fdae6b", "#a63603"], + ) + # Total metrics total_r2 = float(r2_score(self.y_test, y_pred_test, multioutput='uniform_average')) self.node_runner.info(f"Average R2 Score: {total_r2:.4f}") metrics["average_r2"] = total_r2 + if performance_rows: + r2_heatmap_data = [ + {"target": row["target"], "split": row["split"], "r2": row["r2"]} + for row in performance_rows + ] + await save_heatmap_plot( + r2_heatmap_data, + x_key="target", + y_key="split", + color_key="r2", + title="Prediction R2 Heatmap", + task_id=self.task_id, + x_name="Impurity target", + y_name="Dataset split", + color_name="R2 score", + color_domain=[-1.0, 1.0], + color_range=["#b2182b", "#f7f7f7", "#2166ac"], + ) + + rmse_heatmap_data = [ + {"target": row["target"], "split": row["split"], "rmse": row["rmse"]} + for row in performance_rows + ] + max_rmse = max((row["rmse"] for row in performance_rows), default=0.0) + await save_heatmap_plot( + rmse_heatmap_data, + x_key="target", + y_key="split", + color_key="rmse", + title="Prediction RMSE Heatmap", + task_id=self.task_id, + x_name="Impurity target", + y_name="Dataset split", + color_name="RMSE", + color_domain=[0.0, max_rmse] if max_rmse > 0 else None, + color_range=["#ffffcc", "#fd8d3c", "#800026"], + ) + # 6. Save results # Save model to local file target_suffix = "_".join(self.targets) diff --git a/public/machine_learning/visualize_training_data.py b/public/machine_learning/visualize_training_data.py index 8cbe526..9bb46da 100644 --- a/public/machine_learning/visualize_training_data.py +++ b/public/machine_learning/visualize_training_data.py @@ -1,6 +1,7 @@ import asyncio from odmantic import ObjectId +import pandas as pd from simstack.core.context import context from simstack.core.node import node from simstack.models import Parameters @@ -19,23 +20,49 @@ from simstack.models.charts_artifact import create_simple_scatter_chart -async def _save_correlation_heatmap( - corr_frame, +def _configure_heatmap_series( + heatmap_chart, + *, + x_name: str, + y_name: str, + color_name: str, + color_domain=None, + color_range=None, +): + if heatmap_chart.series and isinstance(heatmap_chart.series[0], AGHeatmapSeriesConfig): + heatmap_chart.series[0].xName = x_name + heatmap_chart.series[0].yName = y_name + heatmap_chart.series[0].colorName = color_name + if color_domain is not None: + heatmap_chart.series[0].colorDomain = color_domain + if color_range is not None: + heatmap_chart.series[0].colorRange = color_range + + +async def _save_matrix_heatmap( + matrix_frame, title: str, task_id, node_runner, charts: list, + value_key: str, + color_name: str, x_name: str = "Column", y_name: str = "Row", + color_domain=None, + color_range=None, ): heatmap_data = [] - for row_name in corr_frame.index: - for col_name in corr_frame.columns: + for row_name in matrix_frame.index: + for col_name in matrix_frame.columns: + value = matrix_frame.loc[row_name, col_name] + if pd.isna(value): + continue heatmap_data.append( { "x_feature": str(col_name), "y_feature": str(row_name), - "correlation": float(corr_frame.loc[row_name, col_name]), + value_key: float(value), } ) @@ -43,17 +70,112 @@ async def _save_correlation_heatmap( data=heatmap_data, x_key="x_feature", y_key="y_feature", - color_key="correlation", + color_key=value_key, title=title, parent_id=ObjectId(task_id) if task_id else None, ) - if heatmap_chart.series and isinstance(heatmap_chart.series[0], AGHeatmapSeriesConfig): - heatmap_chart.series[0].xName = x_name - heatmap_chart.series[0].yName = y_name - heatmap_chart.series[0].colorName = "Pearson correlation" - heatmap_chart.series[0].colorDomain = [-1.0, 1.0] - heatmap_chart.series[0].colorRange = ["#2166ac", "#f7f7f7", "#b2182b"] + _configure_heatmap_series( + heatmap_chart, + x_name=x_name, + y_name=y_name, + color_name=color_name, + color_domain=color_domain, + color_range=color_range, + ) + + await context.db.save(heatmap_chart) + charts.append(heatmap_chart) + node_runner.info(f"Saved heatmap: {title}") + + +async def _save_correlation_heatmap( + corr_frame, + title: str, + task_id, + node_runner, + charts: list, + x_name: str = "Column", + y_name: str = "Row", + color_name: str = "Pearson correlation", +): + await _save_matrix_heatmap( + corr_frame, + title=title, + task_id=task_id, + node_runner=node_runner, + charts=charts, + value_key="correlation", + color_name=color_name, + x_name=x_name, + y_name=y_name, + color_domain=[-1.0, 1.0], + color_range=["#2166ac", "#f7f7f7", "#b2182b"], + ) + + +def _format_interval_label(interval) -> str: + return f"{interval.left:.3g} to {interval.right:.3g}" + + +async def _save_property_space_impurity_heatmap( + df: pd.DataFrame, + *, + x_col: str, + y_col: str, + impurity_col: str, + title: str, + task_id, + node_runner, + charts: list, + bins: int = 10, +): + clean_df = df[[x_col, y_col, impurity_col]].dropna().copy() + if clean_df.empty: + return + + x_bin_count = min(bins, int(clean_df[x_col].nunique())) + y_bin_count = min(bins, int(clean_df[y_col].nunique())) + if x_bin_count < 2 or y_bin_count < 2: + return + + clean_df["x_bin"] = pd.cut(clean_df[x_col], bins=x_bin_count, duplicates="drop") + clean_df["y_bin"] = pd.cut(clean_df[y_col], bins=y_bin_count, duplicates="drop") + grouped = ( + clean_df.dropna(subset=["x_bin", "y_bin"]) + .groupby(["x_bin", "y_bin"], observed=True)[impurity_col] + .mean() + .reset_index(name="average_impurity") + ) + if grouped.empty: + return + + heatmap_data = [] + for _, row in grouped.iterrows(): + heatmap_data.append( + { + "x_bin": _format_interval_label(row["x_bin"]), + "y_bin": _format_interval_label(row["y_bin"]), + "average_impurity": float(row["average_impurity"]), + } + ) + + heatmap_chart = create_simple_heatmap_chart( + data=heatmap_data, + x_key="x_bin", + y_key="y_bin", + color_key="average_impurity", + title=title, + parent_id=ObjectId(task_id) if task_id else None, + ) + impurity_label = impurity_col.split("_")[0] + _configure_heatmap_series( + heatmap_chart, + x_name="Yield strength bin (MPa)", + y_name="Fracture strain bin", + color_name=f"Average {impurity_label} concentration (wt%)", + color_range=["#fff7ec", "#fdbb84", "#d7301f"], + ) await context.db.save(heatmap_chart) charts.append(heatmap_chart) @@ -172,6 +294,21 @@ async def _visualize_strain_vs_concentration_internal(dataset: PandasModel, **kw x_name="Impurity target", y_name="Model input feature", ) + spearman_feature_target_corr = ( + df[model_feature_cols + impurity_cols] + .corr(method="spearman") + .loc[model_feature_cols, impurity_cols] + ) + await _save_correlation_heatmap( + spearman_feature_target_corr, + title="Spearman ML Feature-to-Target Heatmap", + task_id=task_id, + node_runner=node_runner, + charts=charts, + x_name="Impurity target", + y_name="Model input feature", + color_name="Spearman correlation", + ) engineered_feature_data = {} if all(col in df.columns for col in ("ultimate_strength_MPa", "yield_strength_MPa")): @@ -197,6 +334,20 @@ async def _visualize_strain_vs_concentration_internal(dataset: PandasModel, **kw y_name="Engineered feature", ) + if all(col in df.columns for col in ("yield_strength_MPa", "fracture_strain")): + for impurity_col in impurity_cols: + impurity_label = impurity_col.split("_")[0] + await _save_property_space_impurity_heatmap( + df, + x_col="yield_strength_MPa", + y_col="fracture_strain", + impurity_col=impurity_col, + title=f"Property-Space Heatmap: Average {impurity_label} Concentration", + task_id=task_id, + node_runner=node_runner, + charts=charts, + ) + if hasattr(node_runner, 'result'): node_runner.result = {"charts_count": len(charts), "correlation_matrix": corr.to_dict()} return charts From bca44c83a1be0961e8262f3308fb3607b27647db Mon Sep 17 00:00:00 2001 From: Nick Date: Fri, 13 Mar 2026 12:27:16 +0100 Subject: [PATCH 4/6] Keep onboarding heatmaps in visualization node only --- public/machine_learning/ml_training_helper.py | 200 +----------------- 1 file changed, 3 insertions(+), 197 deletions(-) diff --git a/public/machine_learning/ml_training_helper.py b/public/machine_learning/ml_training_helper.py index a292d89..2ce23bb 100644 --- a/public/machine_learning/ml_training_helper.py +++ b/public/machine_learning/ml_training_helper.py @@ -3,11 +3,10 @@ from typing import Any import numpy as np -import pandas as pd from odmantic import ObjectId from simstack.core.context import context from simstack.models import BooleanData -from simstack.models.charts_artifact import AGHeatmapSeriesConfig, create_simple_heatmap_chart, create_simple_scatter_chart +from simstack.models.charts_artifact import create_simple_scatter_chart from simstack.models.pandas_model import PandasModel from sklearn.metrics import mean_squared_error, r2_score from sklearn.model_selection import train_test_split @@ -51,112 +50,6 @@ async def save_scatter_plot(y_true, y_pred, title, x_label, y_label, task_id, ** await context.db.save(chart) return chart - -async def save_heatmap_plot( - data, - *, - x_key, - y_key, - color_key, - title, - task_id, - x_name, - y_name, - color_name, - color_domain=None, - color_range=None, -): - chart = create_simple_heatmap_chart( - data=data, - x_key=x_key, - y_key=y_key, - color_key=color_key, - title=title, - parent_id=ObjectId(task_id) if task_id else None, - ) - if chart.series and isinstance(chart.series[0], AGHeatmapSeriesConfig): - chart.series[0].xName = x_name - chart.series[0].yName = y_name - chart.series[0].colorName = color_name - if color_domain is not None: - chart.series[0].colorDomain = color_domain - if color_range is not None: - chart.series[0].colorRange = color_range - await context.db.save(chart) - return chart - - -def _format_interval_label(interval) -> str: - return f"{interval.left:.3g} to {interval.right:.3g}" - - -def _build_prediction_density_heatmap_data(y_true, y_pred, *, bins: int = 12): - actual = np.asarray(y_true, dtype=float).reshape(-1) - predicted = np.asarray(y_pred, dtype=float).reshape(-1) - combined = np.concatenate([actual, predicted]) - if combined.size == 0 or np.nanmin(combined) == np.nanmax(combined): - return [] - - bin_count = min(bins, max(2, np.unique(combined).size)) - bin_edges = np.linspace(np.nanmin(combined), np.nanmax(combined), bin_count + 1) - frame = pd.DataFrame( - { - "actual_bin": pd.cut(actual, bins=bin_edges, include_lowest=True, duplicates="drop"), - "predicted_bin": pd.cut(predicted, bins=bin_edges, include_lowest=True, duplicates="drop"), - } - ).dropna() - if frame.empty: - return [] - - grouped = frame.groupby(["actual_bin", "predicted_bin"], observed=True).size().reset_index(name="sample_count") - return [ - { - "actual_bin": _format_interval_label(row["actual_bin"]), - "predicted_bin": _format_interval_label(row["predicted_bin"]), - "sample_count": int(row["sample_count"]), - } - for _, row in grouped.iterrows() - ] - - -def _build_property_error_heatmap_data(x_values, y_values, error_values, *, bins: int = 10): - frame = pd.DataFrame( - { - "x_value": np.asarray(x_values, dtype=float).reshape(-1), - "y_value": np.asarray(y_values, dtype=float).reshape(-1), - "abs_error": np.asarray(error_values, dtype=float).reshape(-1), - } - ).dropna() - if frame.empty: - return [] - - x_bin_count = min(bins, int(frame["x_value"].nunique())) - y_bin_count = min(bins, int(frame["y_value"].nunique())) - if x_bin_count < 2 or y_bin_count < 2: - return [] - - frame["x_bin"] = pd.cut(frame["x_value"], bins=x_bin_count, duplicates="drop") - frame["y_bin"] = pd.cut(frame["y_value"], bins=y_bin_count, duplicates="drop") - grouped = ( - frame.dropna(subset=["x_bin", "y_bin"]) - .groupby(["x_bin", "y_bin"], observed=True)["abs_error"] - .mean() - .reset_index(name="mean_abs_error") - ) - return [ - { - "x_bin": _format_interval_label(row["x_bin"]), - "y_bin": _format_interval_label(row["y_bin"]), - "mean_abs_error": float(row["mean_abs_error"]), - } - for _, row in grouped.iterrows() - ] - - -def _target_label(target: str) -> str: - return target.replace("_wt_percent", "").replace("_", " ") - - default_element_selector = ElementSelector(use_C=True, use_Mn=True, use_P=True, use_S=True) class RegressionAnalysis: @@ -209,7 +102,7 @@ async def make_model_data(self): # 2. Define Features and Targets # Drop rows with NaN if any (though synthetic data should be clean) - df_clean = df[self.features + self.targets].dropna().copy() + df_clean = df[self.features + self.targets].dropna() X_cols = self.features.copy() if self.use_engineered_features: @@ -264,33 +157,19 @@ async def model_analysis(self, model: Any) -> RegressionResult: y_pred_test = y_pred_test_raw metrics = {} - performance_rows = [] for i, target in enumerate(self.targets): # Metrics y_true_test = self.y_test.iloc[:, i] if len(self.targets) > 1 else self.y_test - y_true_train = self.y_train.iloc[:, i] if len(self.targets) > 1 else self.y_train - train_mse = mean_squared_error(y_true_train, y_pred_train[:, i]) - train_r2 = r2_score(y_true_train, y_pred_train[:, i]) mse = mean_squared_error(y_true_test, y_pred_test[:, i]) r2 = r2_score(y_true_test, y_pred_test[:, i]) - train_rmse = float(np.sqrt(train_mse)) - test_rmse = float(np.sqrt(mse)) metrics[f"{target}_mse"] = float(mse) metrics[f"{target}_r2"] = float(r2) - metrics[f"{target}_train_r2"] = float(train_r2) - metrics[f"{target}_train_rmse"] = train_rmse - metrics[f"{target}_test_rmse"] = test_rmse self.node_runner.info(f"Target {target}: MSE={mse:.6f}, R2={r2:.4f}") - performance_rows.extend( - [ - {"target": _target_label(target), "split": "Train", "r2": float(train_r2), "rmse": train_rmse}, - {"target": _target_label(target), "split": "Test", "r2": float(r2), "rmse": test_rmse}, - ] - ) # Scatter Plots using ChartArtifactModel # Training data + y_true_train = self.y_train.iloc[:, i] if len(self.targets) > 1 else self.y_train await save_scatter_plot( y_true_train, y_pred_train[:, i], title=f"Train: {target}", @@ -306,84 +185,11 @@ async def model_analysis(self, model: Any) -> RegressionResult: task_id=self.task_id ) - density_heatmap_data = _build_prediction_density_heatmap_data(y_true_test, y_pred_test[:, i]) - if density_heatmap_data: - await save_heatmap_plot( - density_heatmap_data, - x_key="actual_bin", - y_key="predicted_bin", - color_key="sample_count", - title=f"Actual vs Predicted Density Heatmap: {_target_label(target)}", - task_id=self.task_id, - x_name="Actual value bin", - y_name="Predicted value bin", - color_name="Sample count", - color_range=["#f7fbff", "#6baed6", "#08306b"], - ) - - if "yield_strength_MPa" in self.X_test.columns and "fracture_strain" in self.X_test.columns: - error_heatmap_data = _build_property_error_heatmap_data( - self.X_test["yield_strength_MPa"], - self.X_test["fracture_strain"], - np.abs(np.asarray(y_true_test, dtype=float).reshape(-1) - y_pred_test[:, i]), - ) - if error_heatmap_data: - await save_heatmap_plot( - error_heatmap_data, - x_key="x_bin", - y_key="y_bin", - color_key="mean_abs_error", - title=f"Prediction Error in Property Space: {_target_label(target)}", - task_id=self.task_id, - x_name="Yield strength bin (MPa)", - y_name="Fracture strain bin", - color_name="Mean absolute error", - color_range=["#fff5eb", "#fdae6b", "#a63603"], - ) - # Total metrics total_r2 = float(r2_score(self.y_test, y_pred_test, multioutput='uniform_average')) self.node_runner.info(f"Average R2 Score: {total_r2:.4f}") metrics["average_r2"] = total_r2 - if performance_rows: - r2_heatmap_data = [ - {"target": row["target"], "split": row["split"], "r2": row["r2"]} - for row in performance_rows - ] - await save_heatmap_plot( - r2_heatmap_data, - x_key="target", - y_key="split", - color_key="r2", - title="Prediction R2 Heatmap", - task_id=self.task_id, - x_name="Impurity target", - y_name="Dataset split", - color_name="R2 score", - color_domain=[-1.0, 1.0], - color_range=["#b2182b", "#f7f7f7", "#2166ac"], - ) - - rmse_heatmap_data = [ - {"target": row["target"], "split": row["split"], "rmse": row["rmse"]} - for row in performance_rows - ] - max_rmse = max((row["rmse"] for row in performance_rows), default=0.0) - await save_heatmap_plot( - rmse_heatmap_data, - x_key="target", - y_key="split", - color_key="rmse", - title="Prediction RMSE Heatmap", - task_id=self.task_id, - x_name="Impurity target", - y_name="Dataset split", - color_name="RMSE", - color_domain=[0.0, max_rmse] if max_rmse > 0 else None, - color_range=["#ffffcc", "#fd8d3c", "#800026"], - ) - # 6. Save results # Save model to local file target_suffix = "_".join(self.targets) From 1b65f3122ee50d0dc77ed4650f62525cf81b0009 Mon Sep 17 00:00:00 2001 From: Nick Date: Fri, 13 Mar 2026 13:51:26 +0100 Subject: [PATCH 5/6] Keep only Spearman heatmap in onboarding visualization --- .../visualize_training_data.py | 186 +----------------- 1 file changed, 9 insertions(+), 177 deletions(-) diff --git a/public/machine_learning/visualize_training_data.py b/public/machine_learning/visualize_training_data.py index 9bb46da..c9d833b 100644 --- a/public/machine_learning/visualize_training_data.py +++ b/public/machine_learning/visualize_training_data.py @@ -1,7 +1,6 @@ import asyncio from odmantic import ObjectId -import pandas as pd from simstack.core.context import context from simstack.core.node import node from simstack.models import Parameters @@ -39,30 +38,27 @@ def _configure_heatmap_series( heatmap_chart.series[0].colorRange = color_range -async def _save_matrix_heatmap( - matrix_frame, +async def _save_correlation_heatmap( + corr_frame, title: str, task_id, node_runner, charts: list, - value_key: str, - color_name: str, x_name: str = "Column", y_name: str = "Row", - color_domain=None, - color_range=None, + color_name: str = "Correlation", ): heatmap_data = [] - for row_name in matrix_frame.index: - for col_name in matrix_frame.columns: - value = matrix_frame.loc[row_name, col_name] - if pd.isna(value): + for row_name in corr_frame.index: + for col_name in corr_frame.columns: + value = corr_frame.loc[row_name, col_name] + if value != value: continue heatmap_data.append( { "x_feature": str(col_name), "y_feature": str(row_name), - value_key: float(value), + "correlation": float(value), } ) @@ -70,113 +66,19 @@ async def _save_matrix_heatmap( data=heatmap_data, x_key="x_feature", y_key="y_feature", - color_key=value_key, + color_key="correlation", title=title, parent_id=ObjectId(task_id) if task_id else None, ) - _configure_heatmap_series( heatmap_chart, x_name=x_name, y_name=y_name, color_name=color_name, - color_domain=color_domain, - color_range=color_range, - ) - - await context.db.save(heatmap_chart) - charts.append(heatmap_chart) - node_runner.info(f"Saved heatmap: {title}") - - -async def _save_correlation_heatmap( - corr_frame, - title: str, - task_id, - node_runner, - charts: list, - x_name: str = "Column", - y_name: str = "Row", - color_name: str = "Pearson correlation", -): - await _save_matrix_heatmap( - corr_frame, - title=title, - task_id=task_id, - node_runner=node_runner, - charts=charts, - value_key="correlation", - color_name=color_name, - x_name=x_name, - y_name=y_name, color_domain=[-1.0, 1.0], color_range=["#2166ac", "#f7f7f7", "#b2182b"], ) - -def _format_interval_label(interval) -> str: - return f"{interval.left:.3g} to {interval.right:.3g}" - - -async def _save_property_space_impurity_heatmap( - df: pd.DataFrame, - *, - x_col: str, - y_col: str, - impurity_col: str, - title: str, - task_id, - node_runner, - charts: list, - bins: int = 10, -): - clean_df = df[[x_col, y_col, impurity_col]].dropna().copy() - if clean_df.empty: - return - - x_bin_count = min(bins, int(clean_df[x_col].nunique())) - y_bin_count = min(bins, int(clean_df[y_col].nunique())) - if x_bin_count < 2 or y_bin_count < 2: - return - - clean_df["x_bin"] = pd.cut(clean_df[x_col], bins=x_bin_count, duplicates="drop") - clean_df["y_bin"] = pd.cut(clean_df[y_col], bins=y_bin_count, duplicates="drop") - grouped = ( - clean_df.dropna(subset=["x_bin", "y_bin"]) - .groupby(["x_bin", "y_bin"], observed=True)[impurity_col] - .mean() - .reset_index(name="average_impurity") - ) - if grouped.empty: - return - - heatmap_data = [] - for _, row in grouped.iterrows(): - heatmap_data.append( - { - "x_bin": _format_interval_label(row["x_bin"]), - "y_bin": _format_interval_label(row["y_bin"]), - "average_impurity": float(row["average_impurity"]), - } - ) - - heatmap_chart = create_simple_heatmap_chart( - data=heatmap_data, - x_key="x_bin", - y_key="y_bin", - color_key="average_impurity", - title=title, - parent_id=ObjectId(task_id) if task_id else None, - ) - impurity_label = impurity_col.split("_")[0] - _configure_heatmap_series( - heatmap_chart, - x_name="Yield strength bin (MPa)", - y_name="Fracture strain bin", - color_name=f"Average {impurity_label} concentration (wt%)", - color_range=["#fff7ec", "#fdbb84", "#d7301f"], - ) - await context.db.save(heatmap_chart) charts.append(heatmap_chart) node_runner.info(f"Saved heatmap: {title}") @@ -248,16 +150,6 @@ async def _visualize_strain_vs_concentration_internal(dataset: PandasModel, **kw charts.append(corr_table) node_runner.info("Saved correlation matrix table") - await _save_correlation_heatmap( - corr, - title="Impurity / Strain Correlation Heatmap", - task_id=task_id, - node_runner=node_runner, - charts=charts, - x_name="Variable (X)", - y_name="Variable (Y)", - ) - model_feature_cols = [ col for col in [ @@ -271,29 +163,7 @@ async def _visualize_strain_vs_concentration_internal(dataset: PandasModel, **kw if col in df.columns ] - if len(model_feature_cols) >= 2: - feature_corr = df[model_feature_cols].corr() - await _save_correlation_heatmap( - feature_corr, - title="ML Input Feature Correlation Heatmap", - task_id=task_id, - node_runner=node_runner, - charts=charts, - x_name="Model input feature (X)", - y_name="Model input feature (Y)", - ) - if model_feature_cols: - feature_target_corr = df[model_feature_cols + impurity_cols].corr().loc[model_feature_cols, impurity_cols] - await _save_correlation_heatmap( - feature_target_corr, - title="ML Feature-to-Target Correlation Heatmap", - task_id=task_id, - node_runner=node_runner, - charts=charts, - x_name="Impurity target", - y_name="Model input feature", - ) spearman_feature_target_corr = ( df[model_feature_cols + impurity_cols] .corr(method="spearman") @@ -310,44 +180,6 @@ async def _visualize_strain_vs_concentration_internal(dataset: PandasModel, **kw color_name="Spearman correlation", ) - engineered_feature_data = {} - if all(col in df.columns for col in ("ultimate_strength_MPa", "yield_strength_MPa")): - engineered_feature_data["strength_ratio"] = df["ultimate_strength_MPa"] / df["yield_strength_MPa"] - if all(col in df.columns for col in ("fracture_strain", "ultimate_strength_MPa", "youngs_modulus_MPa")): - engineered_feature_data["strain_margin"] = ( - df["fracture_strain"] - (df["ultimate_strength_MPa"] / df["youngs_modulus_MPa"]) - ) - - if engineered_feature_data: - engineered_df = df[impurity_cols].copy() - for col_name, series in engineered_feature_data.items(): - engineered_df[col_name] = series - engineered_cols = list(engineered_feature_data.keys()) - engineered_corr = engineered_df[engineered_cols + impurity_cols].corr().loc[engineered_cols, impurity_cols] - await _save_correlation_heatmap( - engineered_corr, - title="Engineered Feature-to-Target Correlation Heatmap", - task_id=task_id, - node_runner=node_runner, - charts=charts, - x_name="Impurity target", - y_name="Engineered feature", - ) - - if all(col in df.columns for col in ("yield_strength_MPa", "fracture_strain")): - for impurity_col in impurity_cols: - impurity_label = impurity_col.split("_")[0] - await _save_property_space_impurity_heatmap( - df, - x_col="yield_strength_MPa", - y_col="fracture_strain", - impurity_col=impurity_col, - title=f"Property-Space Heatmap: Average {impurity_label} Concentration", - task_id=task_id, - node_runner=node_runner, - charts=charts, - ) - if hasattr(node_runner, 'result'): node_runner.result = {"charts_count": len(charts), "correlation_matrix": corr.to_dict()} return charts From b6db956e4ca4bc75215a14ceea6dd846a5581f9a Mon Sep 17 00:00:00 2001 From: Nick Date: Tue, 17 Mar 2026 13:04:37 +0100 Subject: [PATCH 6/6] Improved heatmap axis labels and the chart name. #143 --- public/machine_learning/visualize_training_data.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/public/machine_learning/visualize_training_data.py b/public/machine_learning/visualize_training_data.py index c9d833b..455b351 100644 --- a/public/machine_learning/visualize_training_data.py +++ b/public/machine_learning/visualize_training_data.py @@ -171,12 +171,12 @@ async def _visualize_strain_vs_concentration_internal(dataset: PandasModel, **kw ) await _save_correlation_heatmap( spearman_feature_target_corr, - title="Spearman ML Feature-to-Target Heatmap", + title="Spearman Correlation: Concentrations vs Stress-Strain Features", task_id=task_id, node_runner=node_runner, charts=charts, - x_name="Impurity target", - y_name="Model input feature", + x_name="Concentration", + y_name="Stress-Strain Feature", color_name="Spearman correlation", )