From 3ee35275a23663e939dd7439bb36bc2b8cae2bc1 Mon Sep 17 00:00:00 2001 From: Sahil Jhawar Date: Wed, 25 Mar 2026 18:02:58 +0100 Subject: [PATCH 01/13] feat(kp): add SIDC source --- swvo/io/kp/__init__.py | 1 + swvo/io/kp/sidc.py | 325 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 326 insertions(+) create mode 100755 swvo/io/kp/sidc.py diff --git a/swvo/io/kp/__init__.py b/swvo/io/kp/__init__.py index 9549a15..9353b1d 100755 --- a/swvo/io/kp/__init__.py +++ b/swvo/io/kp/__init__.py @@ -6,6 +6,7 @@ from swvo.io.kp.niemegk import KpNiemegk as KpNiemegk from swvo.io.kp.omni import KpOMNI as KpOMNI from swvo.io.kp.swpc import KpSWPC as KpSWPC +from swvo.io.kp.sidc import KpSIDC as KpSIDC # This has to be imported after the models to avoid a circular import from swvo.io.kp.read_kp_from_multiple_models import read_kp_from_multiple_models as read_kp_from_multiple_models # noqa: I001 diff --git a/swvo/io/kp/sidc.py b/swvo/io/kp/sidc.py new file mode 100755 index 0000000..96561d5 --- /dev/null +++ b/swvo/io/kp/sidc.py @@ -0,0 +1,325 @@ +# SPDX-FileCopyrightText: 2026 GFZ Helmholtz Centre for Geosciences +# SPDX-FileContributor: Sahil Jhawar +# +# SPDX-License-Identifier: Apache-2.0 + +""" +Module for handling SIDC Kp data. +""" + +import logging +import warnings +from datetime import datetime, timedelta, timezone +from pathlib import Path +from shutil import rmtree +from typing import List, Optional, Tuple + +import numpy as np +import pandas as pd +import requests + +from swvo.io.base import BaseIO +from swvo.io.utils import enforce_utc_timezone + +logger = logging.getLogger(__name__) + +logging.captureWarnings(True) + + +class KpSIDC(BaseIO): + """A class to handle SIDC Kp data. + + Parameters + ---------- + data_dir : Path | None + Data directory for the SIDC Kp data. If not provided, it will be read from the environment variable + + Methods + ------- + download_and_process + read + + Raises + ------ + ValueError + Returns `ValueError` if necessary environment variable is not set. + """ + + ENV_VAR_NAME = "RT_KP_SIDC_STREAM_DIR" + + URL = "https://ssa.sidc.be/prod/API/index.php?component=latest&pc=G158&psc=a" + NAME = "kp.json" + + DAYS_TO_SAVE_EACH_FILE = 3 + LABEL = "sidc" + + def download_and_process( + self, start_time: Optional[datetime] = None, end_time: Optional[datetime] = None, reprocess_files: bool = False + ) -> None: + """Download and process SIDC Kp data file. + + Parameters + ---------- + start_time : Optional[datetime] + Start time of the data to download and process. + end_time : Optional[datetime] + End time of the data to download and process. + reprocess_files : bool, optional + Downloads and processes the files again, defaults to False, by default False + + Raises + ------ + FileNotFoundError + Raise `FileNotFoundError` if the file is not downloaded successfully. + """ + + if start_time is None: + start_time = datetime.now(timezone.utc) + + if end_time is None: + end_time = datetime.now(timezone.utc) + timedelta(days=2) + + if start_time >= end_time: + msg = "start_time must be before end_time" + logger.error(msg) + raise ValueError(msg) + + temporary_dir = Path("./temp_kp_sidc_wget") + temporary_dir.mkdir(exist_ok=True, parents=True) + + logger.debug(f"Downloading file {self.URL} ...") + + file_paths, time_intervals = self._get_processed_file_list(start_time, end_time) + for file_path, time_interval in zip(file_paths, time_intervals): + if file_path.exists() and not reprocess_files: + continue + tmp_path = file_path.with_suffix(file_path.suffix + ".tmp") + try: + self._download(temporary_dir, start_time, end_time) + + # check if download was successfull + json_file = temporary_dir / self.NAME + if not json_file.exists() or json_file.stat().st_size == 0: + raise FileNotFoundError(f"Error while downloading file: {self.URL + self.NAME}!") + + logger.debug("Processing file ...") + processed_df = self._process_single_file(json_file) + data_single_file = processed_df[ + (processed_df.index >= time_interval[0]) & (processed_df.index <= time_interval[1]) + ] + + if len(data_single_file.index) == 0: + continue + + file_path.parent.mkdir(parents=True, exist_ok=True) + data_single_file.to_csv(tmp_path, index=True, header=False) + tmp_path.replace(file_path) + + logger.debug(f"Saving processed file {file_path}") + except Exception as e: + logger.error(f"Failed to process {file_path}: {e}") + if tmp_path.exists(): + tmp_path.unlink() + continue + + rmtree(temporary_dir, ignore_errors=True) + + def _download(self, temporary_dir, start_time: datetime, end_time: datetime) -> None: + response = requests.get( + self.URL + + f"&dts_start={start_time.strftime('%Y-%m-%dT%H:%M:%SZ')}&dts_end={end_time.strftime('%Y-%m-%dT%H:%M:%SZ')}" + ) + response.raise_for_status() + + with open(temporary_dir / self.NAME, "w") as f: + f.write(response.text) + + def read( + self, start_time: Optional[datetime] = None, end_time: Optional[datetime] = None, download: bool = False + ) -> pd.DataFrame: + """Read SIDC Kp data for the specified time range. + + Parameters + ---------- + start_time : Optional[datetime] + Start time of the data to read. + end_time : Optional[datetime] + End time of the data to read. + download : bool, optional + Download data on the go, defaults to False. + + Returns + ------- + :class:`pandas.DataFrame` + SIDC Kp dataframe. + """ + + if start_time is None: + start_time = datetime.now(timezone.utc) + + if end_time is None: + end_time = datetime.now(timezone.utc) + timedelta(days=2) + + if start_time >= end_time: + msg = "start_time must be before end_time" + logger.error(msg) + raise ValueError(msg) + + start_time = enforce_utc_timezone(start_time) + end_time = enforce_utc_timezone(end_time) + + file_paths, time_intervals = self._get_processed_file_list(start_time, end_time) + + # initialize data frame with NaNs + t = pd.date_range( + datetime(start_time.year, start_time.month, start_time.day), + datetime(end_time.year, end_time.month, end_time.day, 23, 59, 59), + freq=timedelta(hours=3), + ) + data_out = pd.DataFrame(index=t) + data_out.index = enforce_utc_timezone(data_out.index) # ty: ignore[no-matching-overload] + data_out["kp"] = np.array([np.nan] * len(t)) + data_out["file_name"] = np.array([None] * len(t)) + + for file_path, time_interval in zip(file_paths, time_intervals): + if not file_path.exists(): + if download: + self.download_and_process(start_time, end_time) + + # if we request a date in the future, the file will still not be found here + if not file_path.exists(): + warnings.warn(f"File {file_path} not found") + continue + df_one_file = self._read_single_file(file_path) + + # combine the new file with the old ones, replace all values present in df_one_file in data_out + data_out = df_one_file.combine_first(data_out) + + data_out = data_out.truncate( + before=start_time - timedelta(hours=2.9999), + after=end_time + timedelta(hours=2.9999), + ) + + return data_out + + def _get_processed_file_list(self, start_time: datetime, end_time: datetime) -> Tuple[List, List]: + """Get list of file paths and their corresponding time intervals for monthly files. + + Returns + ------- + Tuple[List, List] + List of file paths and tuples containing (start_time, end_time) for each month. + """ + file_paths = [] + time_intervals = [] + + # Start from the first day of the month containing start_time + current_time = datetime( + start_time.year, + start_time.month, + 1, + 0, + 0, + 0, + tzinfo=timezone.utc, + ) + + # End at the last day of the month containing end_time + if end_time.month == 12: + end_of_period = datetime(end_time.year + 1, 1, 1, 0, 0, 0, tzinfo=timezone.utc) + else: + end_of_period = datetime(end_time.year, end_time.month + 1, 1, 0, 0, 0, tzinfo=timezone.utc) + + while current_time < end_of_period: + file_path = ( + self.data_dir / current_time.strftime("%Y/%m") / f"SIDC_KP_FORECAST_{current_time.strftime('%Y%m')}.csv" + ) + file_paths.append(file_path) + + # Create interval covering the entire month + month_start = current_time + if current_time.month == 12: + month_end = datetime( + current_time.year + 1, + 1, + 1, + 0, + 0, + 0, + tzinfo=timezone.utc, + ) + else: + month_end = datetime( + current_time.year, + current_time.month + 1, + 1, + 0, + 0, + 0, + tzinfo=timezone.utc, + ) + month_end -= timedelta(seconds=1) # Set to the last second of the month + time_intervals.append((month_start, month_end)) + + # Move to next month + if current_time.month == 12: + current_time = datetime(current_time.year + 1, 1, 1, 0, 0, 0, tzinfo=timezone.utc) + else: + current_time = datetime(current_time.year, current_time.month + 1, 1, 0, 0, 0, tzinfo=timezone.utc) + + return file_paths, time_intervals + + def _read_single_file(self, file_path) -> pd.DataFrame: + """Read SIDC Kp file to a DataFrame. + + Parameters + ---------- + file_path : Path + Path to the file. + + Returns + ------- + pd.DataFrame + Data from SIDC Kp file. + """ + df = pd.read_csv(file_path, names=["t", "kp"]) + + df["t"] = pd.to_datetime(df["t"]) + df.index = df["t"] + df.drop(labels=["t"], axis=1, inplace=True) + df.index = enforce_utc_timezone(df.index) # ty: ignore[no-matching-overload] + + df["file_name"] = file_path + df.loc[df["kp"].isna(), "file_name"] = None + + return df + + def _process_single_file(self, temporary_dir: Path) -> pd.DataFrame: + """Process SIDC Kp file to a DataFrame. + + Parameters + ---------- + file_path : Path + Path to the file. + + Returns + ------- + pd.DataFrame + SIDC Kp data. + """ + + data = pd.read_json(temporary_dir) + data = pd.json_normalize(data["data"]) + + data = data.apply(lambda x: pd.to_datetime(x) if x.name != "value" else x) + + data.sort_values(["issue_time", "start_time"], inplace=True) + + data.drop(columns=["end_time"], inplace=True) + + data = data.loc[data.groupby("start_time")["issue_time"].idxmax(), ["start_time", "value"]] + data.rename(columns={"start_time": "t", "value": "Kp"}, inplace=True) + data.index = data["t"] + data.index = enforce_utc_timezone(data.index) + + return data From 9085fde976462788021af16829c1dfc7a5820226 Mon Sep 17 00:00:00 2001 From: Sahil Jhawar Date: Thu, 26 Mar 2026 10:28:01 +0100 Subject: [PATCH 02/13] chore: addr copilot comments chore: addr copilot comments (part 2) --- swvo/io/kp/sidc.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/swvo/io/kp/sidc.py b/swvo/io/kp/sidc.py index 96561d5..ef202f0 100755 --- a/swvo/io/kp/sidc.py +++ b/swvo/io/kp/sidc.py @@ -50,7 +50,6 @@ class KpSIDC(BaseIO): URL = "https://ssa.sidc.be/prod/API/index.php?component=latest&pc=G158&psc=a" NAME = "kp.json" - DAYS_TO_SAVE_EACH_FILE = 3 LABEL = "sidc" def download_and_process( @@ -90,17 +89,16 @@ def download_and_process( logger.debug(f"Downloading file {self.URL} ...") file_paths, time_intervals = self._get_processed_file_list(start_time, end_time) - for file_path, time_interval in zip(file_paths, time_intervals): + for num, (file_path, time_interval) in enumerate(zip(file_paths, time_intervals)): if file_path.exists() and not reprocess_files: continue tmp_path = file_path.with_suffix(file_path.suffix + ".tmp") try: - self._download(temporary_dir, start_time, end_time) - # check if download was successfull - json_file = temporary_dir / self.NAME + json_file = temporary_dir / f"{self.NAME}" + self._download(json_file, time_interval[0], time_interval[1]) if not json_file.exists() or json_file.stat().st_size == 0: - raise FileNotFoundError(f"Error while downloading file: {self.URL + self.NAME}!") + raise FileNotFoundError(f"Error while downloading file: {self.URL}!") logger.debug("Processing file ...") processed_df = self._process_single_file(json_file) @@ -127,11 +125,11 @@ def download_and_process( def _download(self, temporary_dir, start_time: datetime, end_time: datetime) -> None: response = requests.get( self.URL - + f"&dts_start={start_time.strftime('%Y-%m-%dT%H:%M:%SZ')}&dts_end={end_time.strftime('%Y-%m-%dT%H:%M:%SZ')}" + + f"&dts_start={start_time.strftime('%Y-%m-%dT%H:%M:%SZ')}&dts_end={(end_time + timedelta(seconds=1)).strftime('%Y-%m-%dT%H:%M:%SZ')}" ) response.raise_for_status() - with open(temporary_dir / self.NAME, "w") as f: + with open(temporary_dir, "w") as f: f.write(response.text) def read( @@ -232,7 +230,7 @@ def _get_processed_file_list(self, start_time: datetime, end_time: datetime) -> while current_time < end_of_period: file_path = ( - self.data_dir / current_time.strftime("%Y/%m") / f"SIDC_KP_FORECAST_{current_time.strftime('%Y%m')}.csv" + self.data_dir / current_time.strftime("%Y") / f"SIDC_KP_FORECAST_{current_time.strftime('%Y%m')}.csv" ) file_paths.append(file_path) @@ -321,5 +319,6 @@ def _process_single_file(self, temporary_dir: Path) -> pd.DataFrame: data.rename(columns={"start_time": "t", "value": "Kp"}, inplace=True) data.index = data["t"] data.index = enforce_utc_timezone(data.index) + data.drop(columns=["t"], inplace=True) return data From 4646d4ef052b3d182b3079df11abce65453c4a33 Mon Sep 17 00:00:00 2001 From: Sahil Jhawar Date: Thu, 26 Mar 2026 10:50:43 +0100 Subject: [PATCH 03/13] chore: enforce utc tz --- swvo/io/kp/sidc.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/swvo/io/kp/sidc.py b/swvo/io/kp/sidc.py index ef202f0..fe45e5d 100755 --- a/swvo/io/kp/sidc.py +++ b/swvo/io/kp/sidc.py @@ -78,6 +78,9 @@ def download_and_process( if end_time is None: end_time = datetime.now(timezone.utc) + timedelta(days=2) + start_time = enforce_utc_timezone(start_time) + end_time = enforce_utc_timezone(end_time) + if start_time >= end_time: msg = "start_time must be before end_time" logger.error(msg) From eeb3be4ca7889e62dbb87a1dfa7d0d45fb43d500 Mon Sep 17 00:00:00 2001 From: Sahil Jhawar Date: Thu, 26 Mar 2026 10:55:47 +0100 Subject: [PATCH 04/13] chore: change env_var name --- swvo/io/kp/sidc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/swvo/io/kp/sidc.py b/swvo/io/kp/sidc.py index fe45e5d..77061e7 100755 --- a/swvo/io/kp/sidc.py +++ b/swvo/io/kp/sidc.py @@ -45,7 +45,7 @@ class KpSIDC(BaseIO): Returns `ValueError` if necessary environment variable is not set. """ - ENV_VAR_NAME = "RT_KP_SIDC_STREAM_DIR" + ENV_VAR_NAME = "FC_KP_SIDC_STREAM_DIR" URL = "https://ssa.sidc.be/prod/API/index.php?component=latest&pc=G158&psc=a" NAME = "kp.json" From 6e189a4ddb4ce7d5f4a2714cd31d8c41884b4d0d Mon Sep 17 00:00:00 2001 From: Sahil Jhawar Date: Thu, 26 Mar 2026 12:02:50 +0100 Subject: [PATCH 05/13] feat(kp): add BGS Kp --- swvo/io/kp/__init__.py | 1 + swvo/io/kp/bgs.py | 340 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 341 insertions(+) create mode 100755 swvo/io/kp/bgs.py diff --git a/swvo/io/kp/__init__.py b/swvo/io/kp/__init__.py index 9353b1d..8034a7c 100755 --- a/swvo/io/kp/__init__.py +++ b/swvo/io/kp/__init__.py @@ -7,6 +7,7 @@ from swvo.io.kp.omni import KpOMNI as KpOMNI from swvo.io.kp.swpc import KpSWPC as KpSWPC from swvo.io.kp.sidc import KpSIDC as KpSIDC +from swvo.io.kp.bgs import KpBGS as KpBGS # This has to be imported after the models to avoid a circular import from swvo.io.kp.read_kp_from_multiple_models import read_kp_from_multiple_models as read_kp_from_multiple_models # noqa: I001 diff --git a/swvo/io/kp/bgs.py b/swvo/io/kp/bgs.py new file mode 100755 index 0000000..39c3526 --- /dev/null +++ b/swvo/io/kp/bgs.py @@ -0,0 +1,340 @@ +# SPDX-FileCopyrightText: 2026 GFZ Helmholtz Centre for Geosciences +# SPDX-FileContributor: Sahil Jhawar +# +# SPDX-License-Identifier: Apache-2.0 + +""" +Module for handling BGS Kp data. +""" + +import logging +from datetime import datetime, timedelta, timezone +from pathlib import Path +from shutil import rmtree +from typing import List, Optional, Tuple + +import numpy as np +import pandas as pd +import requests +from bs4 import BeautifulSoup + +from swvo.io.base import BaseIO +from swvo.io.utils import enforce_utc_timezone + +logger = logging.getLogger(__name__) + +logging.captureWarnings(True) + + +class KpBGS(BaseIO): + """A class to handle BGS Kp data. + + Parameters + ---------- + data_dir : Path | None + Data directory for the BGS Kp data. If not provided, it will be read from the environment variable + + Methods + ------- + download_and_process + read + + Raises + ------ + ValueError + Returns `ValueError` if necessary environment variable is not set. + """ + + ENV_VAR_NAME = "RT_KP_BGS_STREAM_DIR" + + URL = "https://geomag.bgs.ac.uk/cgi-bin/solar" + NAME = "kp.html" + + LABEL = "bgs" + + def download_and_process(self, request_time: Optional[datetime] = None, reprocess_files: bool = False) -> None: + """Download and process BGS Kp data file for a specific month. + + Parameters + ---------- + request_time : Optional[datetime] + Time for which to download and process data (month and year are extracted). + reprocess_files : bool, optional + Downloads and processes the files again, defaults to False, by default False + + Raises + ------ + FileNotFoundError + Raise `FileNotFoundError` if the file is not downloaded successfully. + """ + + if request_time is None: + request_time = datetime.now(timezone.utc) + + request_time = enforce_utc_timezone(request_time) + + temporary_dir = Path("./temp_kp_bgs_wget") + temporary_dir.mkdir(exist_ok=True, parents=True) + + logger.debug(f"Downloading file {self.URL} ...") + + file_path = self.data_dir / request_time.strftime("%Y") / f"BGS_KP_FORECAST_{request_time.strftime('%Y%m')}.csv" + + if file_path.exists() and not reprocess_files: + return + + tmp_path = file_path.with_suffix(file_path.suffix + ".tmp") + try: + logger.info(f"Downloading file for {request_time.strftime('%Y-%m')} from {self.URL}") + html_file = temporary_dir / f"{self.NAME}" + self._download(html_file, request_time) + if not html_file.exists() or html_file.stat().st_size == 0: + raise FileNotFoundError(f"Error while downloading file: {self.URL}!") + + logger.debug("Processing file ...") + processed_df = self._process_single_file(html_file) + + if len(processed_df.index) == 0: + return + + file_path.parent.mkdir(parents=True, exist_ok=True) + processed_df.to_csv(tmp_path, index=True, header=False) + tmp_path.replace(file_path) + + logger.debug(f"Saving processed file {file_path}") + except Exception as e: + logger.error(f"Failed to process {file_path}: {e}") + if tmp_path.exists(): + tmp_path.unlink() + return + + rmtree(temporary_dir, ignore_errors=True) + + def _download(self, temporary_dir: Path, request_time: datetime) -> None: + """Download BGS Kp data for a specific month. + + Parameters + ---------- + temporary_dir : Path + Path to save the temporary downloaded file. + request_time : datetime + Time for which to download data (month and year are extracted). + """ + + payload = { + "name": "not given", + "solar_geo": "1", + "month": str(request_time.month), + "year": str(request_time.year), + } + + response = requests.post(self.URL, data=payload) + response.raise_for_status() + + with open(temporary_dir, "w") as f: + f.write(response.text) + + def read( + self, start_time: Optional[datetime] = None, end_time: Optional[datetime] = None, download: bool = False + ) -> pd.DataFrame: + """Read BGS Kp data for the specified time range. + + Parameters + ---------- + start_time : Optional[datetime] + Start time of the data to read. + end_time : Optional[datetime] + End time of the data to read. + download : bool, optional + Download data on the go, defaults to False. + + Returns + ------- + :class:`pandas.DataFrame` + BGS Kp dataframe. + """ + + if start_time is None: + start_time = datetime.now(timezone.utc) + + if end_time is None: + end_time = datetime.now(timezone.utc) + timedelta(days=2) + + if start_time >= end_time: + msg = "start_time must be before end_time" + logger.error(msg) + raise ValueError(msg) + + start_time = enforce_utc_timezone(start_time) + end_time = enforce_utc_timezone(end_time) + + logger.info(f"Reading data from {start_time} to {end_time}") + + file_paths, time_intervals = self._get_processed_file_list(start_time, end_time) + + # Download data for every month if download is True + if download: + for time_interval in time_intervals: + self.download_and_process(time_interval[0]) + + # initialize data frame with NaNs + t = pd.date_range( + datetime(start_time.year, start_time.month, start_time.day), + datetime(end_time.year, end_time.month, end_time.day, 23, 59, 59), + freq=timedelta(hours=3), + ) + data_out = pd.DataFrame(index=t) + data_out.index = enforce_utc_timezone(data_out.index) + data_out["kp"] = np.array([np.nan] * len(t)) + data_out["file_name"] = np.array([None] * len(t)) + + for file_path, time_interval in zip(file_paths, time_intervals): + if not file_path.exists(): + logger.warning(f"File {file_path} not found") + logger.warning(f"Data not available from {time_interval[0]} to {time_interval[1]}") + continue + logger.info(f"Reading data from {file_path}") + df_one_file = self._read_single_file(file_path) + + # combine the new file with the old ones, replace all values present in df_one_file in data_out + data_out = df_one_file.combine_first(data_out) + + data_out = data_out.truncate( + before=start_time - timedelta(hours=2.9999), + after=end_time + timedelta(hours=2.9999), + ) + + return data_out + + def _get_processed_file_list(self, start_time: datetime, end_time: datetime) -> Tuple[List, List]: + """Get list of file paths and their corresponding time intervals for monthly files. + + Returns + ------- + Tuple[List, List] + List of file paths and tuples containing (start_time, end_time) for each month. + """ + file_paths = [] + time_intervals = [] + + current_time = datetime( + start_time.year, + start_time.month, + 1, + 0, + 0, + 0, + tzinfo=timezone.utc, + ) + + if end_time.month == 12: + end_of_period = datetime(end_time.year + 1, 1, 1, 0, 0, 0, tzinfo=timezone.utc) + else: + end_of_period = datetime(end_time.year, end_time.month + 1, 1, 0, 0, 0, tzinfo=timezone.utc) + + while current_time < end_of_period: + file_path = ( + self.data_dir / current_time.strftime("%Y") / f"BGS_KP_FORECAST_{current_time.strftime('%Y%m')}.csv" + ) + file_paths.append(file_path) + + month_start = current_time + if current_time.month == 12: + month_end = datetime( + current_time.year + 1, + 1, + 1, + 0, + 0, + 0, + tzinfo=timezone.utc, + ) + else: + month_end = datetime( + current_time.year, + current_time.month + 1, + 1, + 0, + 0, + 0, + tzinfo=timezone.utc, + ) + month_end -= timedelta(seconds=1) + time_intervals.append((month_start, month_end)) + + if current_time.month == 12: + current_time = datetime(current_time.year + 1, 1, 1, 0, 0, 0, tzinfo=timezone.utc) + else: + current_time = datetime(current_time.year, current_time.month + 1, 1, 0, 0, 0, tzinfo=timezone.utc) + + return file_paths, time_intervals + + def _read_single_file(self, file_path) -> pd.DataFrame: + """Read BGS Kp file to a DataFrame. + + Parameters + ---------- + file_path : Path + Path to the file. + + Returns + ------- + pd.DataFrame + Data from BGS Kp file. + """ + df = pd.read_csv(file_path, names=["t", "kp"]) + + df["t"] = pd.to_datetime(df["t"]) + df.index = df["t"] + df.drop(labels=["t"], axis=1, inplace=True) + df.index = enforce_utc_timezone(df.index) + + df["file_name"] = file_path + df.loc[df["kp"].isna(), "file_name"] = None + + return df + + def _process_single_file(self, temporary_dir: Path) -> pd.DataFrame: + """Process BGS Kp file to a DataFrame. + + Parameters + ---------- + file_path : Path + Path to the file. + + Returns + ------- + pd.DataFrame + BGS Kp data. + """ + with open(temporary_dir, "r") as f: + content = f.read() + soup = BeautifulSoup(content, "html.parser") + table = soup.find("table") + + if not table: + msg = f"No table found in response from {self.URL}" + logger.error(msg) + raise ValueError(msg) + + rows = table.find_all("tr") # ty: ignore[unresolved-attribute] + records = [] + + for row in rows[1:]: # skip header row + cols = row.find_all("td") + if len(cols) >= 9: + date = pd.to_datetime(cols[0].text.strip(), format="%d-%m-%y") + + for i, hour in enumerate(range(0, 24, 3)): + kp = self._snap_to_kp_scale(int(cols[i + 1].text.strip()) / 10) + timestamp = date + pd.Timedelta(hours=hour) + records.append({"t": timestamp, "Kp": kp}) + + df = pd.DataFrame(records).set_index("t") + return df + + def _snap_to_kp_scale(self, value: float) -> float: + KP_SCALE = np.linspace(0, 9, 28) + + idx = np.argmin(np.abs(KP_SCALE - value)) + return round(float(KP_SCALE[idx]), 3) From e692af5cdec6a412cff2922f130961bb96070722 Mon Sep 17 00:00:00 2001 From: Sahil Jhawar Date: Thu, 26 Mar 2026 12:03:31 +0100 Subject: [PATCH 06/13] feat: add pd.Index overload --- swvo/io/utils.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/swvo/io/utils.py b/swvo/io/utils.py index f0416be..b59c089 100644 --- a/swvo/io/utils.py +++ b/swvo/io/utils.py @@ -276,7 +276,11 @@ def enforce_utc_timezone(time: pd.Series) -> pd.Series: ... def enforce_utc_timezone(time: pd.DatetimeIndex) -> pd.DatetimeIndex: ... -def enforce_utc_timezone(time: datetime | list[datetime] | pd.Timestamp | pd.Series | pd.DatetimeIndex): +@overload +def enforce_utc_timezone(time: pd.Index) -> pd.Index: ... + + +def enforce_utc_timezone(time: datetime | list[datetime] | pd.Timestamp | pd.Series | pd.DatetimeIndex | pd.Index): """ Ensure datetime object(s) have UTC timezone information. From 83ea5f93b980b5c8c45b4f479f8f224793067152 Mon Sep 17 00:00:00 2001 From: Sahil Jhawar Date: Thu, 26 Mar 2026 12:12:38 +0100 Subject: [PATCH 07/13] chore: add bs4 --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index 3a2112c..046399e 100755 --- a/requirements.txt +++ b/requirements.txt @@ -18,3 +18,4 @@ tqdm>=4.67.3 wget>=3.2 ty ruff +beautifulsoup4 From 0b2dc828ce9ac8081880da2e3fbbf32bb24298ed Mon Sep 17 00:00:00 2001 From: Sahil Jhawar Date: Thu, 26 Mar 2026 12:13:01 +0100 Subject: [PATCH 08/13] chore: remove unused ty commands --- swvo/io/RBMDataSet/RBMDataSet.py | 2 +- swvo/io/f10_7/swpc.py | 2 +- swvo/io/hp/gfz.py | 3 +-- swvo/io/kp/bgs.py | 2 +- swvo/io/kp/ensemble.py | 4 ++-- swvo/io/kp/niemegk.py | 6 +++--- swvo/io/kp/sidc.py | 4 ++-- swvo/io/kp/swpc.py | 2 +- 8 files changed, 12 insertions(+), 13 deletions(-) diff --git a/swvo/io/RBMDataSet/RBMDataSet.py b/swvo/io/RBMDataSet/RBMDataSet.py index 670934a..5955e99 100644 --- a/swvo/io/RBMDataSet/RBMDataSet.py +++ b/swvo/io/RBMDataSet/RBMDataSet.py @@ -464,7 +464,7 @@ def get_loaded_variables(self) -> list[str]: loaded_vars.append(var.var_name) return loaded_vars - def __eq__(self, other: RBMDataSet) -> bool: # type :ignore[override] + def __eq__(self, other: RBMDataSet) -> bool: # ty :ignore[invalid-method-override] if ( self._file_loading_mode != other._file_loading_mode or self._satellite != other._satellite diff --git a/swvo/io/f10_7/swpc.py b/swvo/io/f10_7/swpc.py index 4428a60..227b8ae 100644 --- a/swvo/io/f10_7/swpc.py +++ b/swvo/io/f10_7/swpc.py @@ -255,7 +255,7 @@ def read(self, start_time: datetime, end_time: datetime, *, download: bool = Fal data_out = df_one_file.combine_first(data_out) if not data_out.empty: - data_out.index = enforce_utc_timezone(data_out.index) # ty: ignore[no-matching-overload] + data_out.index = enforce_utc_timezone(data_out.index) data_out.drop("date", axis=1, inplace=True) data_out = data_out.truncate( before=start_time - timedelta(hours=23.9999), diff --git a/swvo/io/hp/gfz.py b/swvo/io/hp/gfz.py index 9338e99..7a4173f 100755 --- a/swvo/io/hp/gfz.py +++ b/swvo/io/hp/gfz.py @@ -305,8 +305,7 @@ def _process_single_file(self, temp_dir: Path, start_time: datetime) -> pd.DataF {f"Hp{self.index_number}": json_data[f"Hp{self.index_number}"]}, index=pd.to_datetime(json_data["datetime"], utc=True), ) - data_total.index = enforce_utc_timezone(data_total.index) # ty: ignore[no-matching-overload] - + data_total.index = enforce_utc_timezone(data_total.index) return data_total def _read_single_file(self, file_path: str) -> pd.DataFrame: diff --git a/swvo/io/kp/bgs.py b/swvo/io/kp/bgs.py index 39c3526..4cf639e 100755 --- a/swvo/io/kp/bgs.py +++ b/swvo/io/kp/bgs.py @@ -317,7 +317,7 @@ def _process_single_file(self, temporary_dir: Path) -> pd.DataFrame: logger.error(msg) raise ValueError(msg) - rows = table.find_all("tr") # ty: ignore[unresolved-attribute] + rows = table.find_all("tr") records = [] for row in rows[1:]: # skip header row diff --git a/swvo/io/kp/ensemble.py b/swvo/io/kp/ensemble.py index a42370a..f173189 100755 --- a/swvo/io/kp/ensemble.py +++ b/swvo/io/kp/ensemble.py @@ -122,7 +122,7 @@ def read(self, start_time: datetime, end_time: datetime) -> list[pd.DataFrame]: freq=timedelta(hours=3), ) data_out = pd.DataFrame(index=t) - data_out.index = enforce_utc_timezone(data_out.index) # ty: ignore[no-matching-overload] + data_out.index = enforce_utc_timezone(data_out.index) data_out["kp"] = np.array([np.nan] * len(t)) data_out = data_out.truncate( before=start_time - timedelta(hours=2.9999), @@ -144,7 +144,7 @@ def read(self, start_time: datetime, end_time: datetime) -> list[pd.DataFrame]: df["file_name"] = file df.loc[df["kp"].isna(), "file_name"] = None - df.index = enforce_utc_timezone(df.index) # ty: ignore[no-matching-overload] + df.index = enforce_utc_timezone(df.index) df = df.truncate( before=start_time - timedelta(hours=2.9999), diff --git a/swvo/io/kp/niemegk.py b/swvo/io/kp/niemegk.py index 5dd13db..d2849d5 100755 --- a/swvo/io/kp/niemegk.py +++ b/swvo/io/kp/niemegk.py @@ -156,7 +156,7 @@ def read(self, start_time: datetime, end_time: datetime, download: bool = False) freq=timedelta(hours=3), ) data_out = pd.DataFrame(index=t) - data_out.index = enforce_utc_timezone(data_out.index) # ty: ignore[no-matching-overload] + data_out.index = enforce_utc_timezone(data_out.index) data_out["kp"] = np.array([np.nan] * len(t)) data_out["file_name"] = np.array([None] * len(t)) @@ -247,7 +247,7 @@ def _read_single_file(self, file_path) -> pd.DataFrame: df["t"] = pd.to_datetime(df["t"]) df.index = df["t"] df.drop(labels=["t"], axis=1, inplace=True) - df.index = enforce_utc_timezone(df.index) # ty: ignore[no-matching-overload] + df.index = enforce_utc_timezone(df.index) df["file_name"] = file_path df.loc[df["kp"].isna(), "file_name"] = None @@ -294,7 +294,7 @@ def _process_single_file(self, temporary_dir: Path) -> pd.DataFrame: ) data.index.rename("t", inplace=True) data.index = data["t"] - data.index = enforce_utc_timezone(data.index) # ty: ignore[no-matching-overload] + data.index = enforce_utc_timezone(data.index) data.drop(labels=["t"], axis=1, inplace=True) data.dropna(inplace=True) data = data[data["kp"] != -1.0] diff --git a/swvo/io/kp/sidc.py b/swvo/io/kp/sidc.py index 77061e7..2810f7b 100755 --- a/swvo/io/kp/sidc.py +++ b/swvo/io/kp/sidc.py @@ -178,7 +178,7 @@ def read( freq=timedelta(hours=3), ) data_out = pd.DataFrame(index=t) - data_out.index = enforce_utc_timezone(data_out.index) # ty: ignore[no-matching-overload] + data_out.index = enforce_utc_timezone(data_out.index) data_out["kp"] = np.array([np.nan] * len(t)) data_out["file_name"] = np.array([None] * len(t)) @@ -288,7 +288,7 @@ def _read_single_file(self, file_path) -> pd.DataFrame: df["t"] = pd.to_datetime(df["t"]) df.index = df["t"] df.drop(labels=["t"], axis=1, inplace=True) - df.index = enforce_utc_timezone(df.index) # ty: ignore[no-matching-overload] + df.index = enforce_utc_timezone(df.index) df["file_name"] = file_path df.loc[df["kp"].isna(), "file_name"] = None diff --git a/swvo/io/kp/swpc.py b/swvo/io/kp/swpc.py index 85a8cd2..66edb7f 100755 --- a/swvo/io/kp/swpc.py +++ b/swvo/io/kp/swpc.py @@ -174,7 +174,7 @@ def read(self, start_time: datetime, end_time: Optional[datetime] = None, downlo freq=timedelta(hours=3), ) data_out = pd.DataFrame(index=t) - data_out.index = enforce_utc_timezone(data_out.index) # ty: ignore[no-matching-overload] + data_out.index = enforce_utc_timezone(data_out.index) data_out["kp"] = np.array([np.nan] * len(t)) data_out["file_name"] = np.array([np.nan] * len(t)) From b1c8b44dc81fd167f867d2b7a7d98dc26729957e Mon Sep 17 00:00:00 2001 From: Sahil Jhawar Date: Thu, 26 Mar 2026 13:01:45 +0100 Subject: [PATCH 09/13] test: skip download --- tests/io/sme/test_sme_supermag.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/io/sme/test_sme_supermag.py b/tests/io/sme/test_sme_supermag.py index 9323632..bb4f2ee 100644 --- a/tests/io/sme/test_sme_supermag.py +++ b/tests/io/sme/test_sme_supermag.py @@ -69,6 +69,7 @@ def test_get_processed_file_list(self, sme_instance): assert all(path.name.startswith("SuperMAG_SME_") for path in file_paths) assert len(time_intervals) == 32 + @pytest.mark.skip(reason="SME servers are down") def test_download_and_process(self, sme_instance): sme_instance.download_and_process(datetime(2020, 1, 1), datetime(2020, 1, 2)) From 9eeb0aa8f43d1148311d54c18dd35f13433cf239 Mon Sep 17 00:00:00 2001 From: Sahil Jhawar Date: Thu, 26 Mar 2026 13:02:02 +0100 Subject: [PATCH 10/13] test: add sidc tests --- tests/io/kp/test_kp_sidc.py | 214 ++++++++++++++++++++++++++++++++++++ 1 file changed, 214 insertions(+) create mode 100644 tests/io/kp/test_kp_sidc.py diff --git a/tests/io/kp/test_kp_sidc.py b/tests/io/kp/test_kp_sidc.py new file mode 100644 index 0000000..496d4b5 --- /dev/null +++ b/tests/io/kp/test_kp_sidc.py @@ -0,0 +1,214 @@ +# SPDX-FileCopyrightText: 2025 GFZ Helmholtz Centre for Geosciences +# +# SPDX-License-Identifier: Apache-2.0 + +import os +import shutil +from datetime import datetime, timedelta, timezone +from pathlib import Path + +import numpy as np +import pandas as pd +import pytest + +from swvo.io.kp import KpSIDC + +TEST_DIR = Path("test_data") +DATA_DIR = TEST_DIR / "mock_kp_sidc" + + +class TestKpSIDC: + @pytest.fixture(scope="session", autouse=True) + def setup_and_cleanup(self): + TEST_DIR.mkdir(exist_ok=True) + DATA_DIR.mkdir(exist_ok=True) + + yield + + if TEST_DIR.exists(): + shutil.rmtree(TEST_DIR, ignore_errors=True) + + @pytest.fixture + def kp_sidc_instance(self): + return KpSIDC(data_dir=DATA_DIR) + + def test_initialization_with_data_dir(self): + instance = KpSIDC(data_dir=DATA_DIR) + assert instance.data_dir == DATA_DIR + assert instance.data_dir.exists() + + def test_initialization_without_env_var(self): + if KpSIDC.ENV_VAR_NAME in os.environ: + del os.environ[KpSIDC.ENV_VAR_NAME] + with pytest.raises( + ValueError, + match=f"Necessary environment variable {KpSIDC.ENV_VAR_NAME} not set!", + ): + KpSIDC() + + def test_initialization_with_env_var(self): + os.environ[KpSIDC.ENV_VAR_NAME] = str(DATA_DIR) + instance = KpSIDC() + assert instance.data_dir == DATA_DIR + + def test_get_processed_file_list(self): + instance = KpSIDC(data_dir=DATA_DIR) + start_time = datetime(2024, 1, 1) + end_time = datetime(2024, 5, 3) + + file_paths, time_intervals = instance._get_processed_file_list(start_time, end_time) + + assert len(file_paths) == 5 + assert len(time_intervals) == 5 + + assert file_paths[0].name == "SIDC_KP_FORECAST_202401.csv" + assert file_paths[-1].name == "SIDC_KP_FORECAST_202405.csv" + + assert time_intervals[0][0] == datetime(2024, 1, 1, tzinfo=timezone.utc) + assert time_intervals[0][1] == datetime(2024, 1, 31, 23, 59, 59, tzinfo=timezone.utc) + + def test_download_and_process_current_month(self, kp_sidc_instance): + current_time = datetime.now(timezone.utc) + end_time = current_time + timedelta(days=2) + + kp_sidc_instance.download_and_process(current_time, end_time, reprocess_files=True) + + file_paths, _ = kp_sidc_instance._get_processed_file_list(current_time, end_time) + + for file_path in file_paths: + if file_path.exists(): + df = pd.read_csv(file_path, names=["t", "kp"]) + assert len(df) > 0 + assert "t" in df.columns + assert "kp" in df.columns + + valid_kps = df["kp"].dropna() + assert valid_kps.min() >= 0 + assert valid_kps.max() <= 9 + + def test_download_past_month(self, kp_sidc_instance): + past_time = datetime.now(timezone.utc) - timedelta(days=32) + end_time = past_time + timedelta(days=2) + + kp_sidc_instance.download_and_process(past_time, end_time) + + file_paths, _ = kp_sidc_instance._get_processed_file_list(past_time, end_time) + + for file_path in file_paths: + assert file_path.exists() + + def test_read_with_download(self, kp_sidc_instance): + current_time = datetime.now() + end_time = current_time + timedelta(days=1) + + data = kp_sidc_instance.read(current_time, end_time, download=True) + + assert isinstance(data, pd.DataFrame) + assert "kp" in data.columns + assert "file_name" in data.columns + assert isinstance(data.index, pd.DatetimeIndex) + + assert data.index[0] >= current_time.replace(tzinfo=timezone.utc) - timedelta(hours=3) + assert data.index[-1] <= end_time.replace(tzinfo=timezone.utc) + timedelta(hours=3) + + def test_read_without_download_no_file(self, kp_sidc_instance): + current_time = datetime.now() + end_time = current_time + timedelta(days=1) + + if DATA_DIR.exists(): + for file in DATA_DIR.glob("**/SIDC_KP_FORECAST_*.csv"): + file.unlink() + + data = kp_sidc_instance.read(current_time, end_time, download=False) + assert isinstance(data, pd.DataFrame) + assert data["kp"].isna().all() + + def test_process_single_file(self): + instance = KpSIDC(data_dir=DATA_DIR) + temp_dir = Path("./temp_test") + temp_dir.mkdir(exist_ok=True) + + try: + sample_data = """[ + { + "data": { + "end_time": "2024-01-31T00:00:00Z", + "issue_time": "2024-01-30T12:39:59Z", + "start_time": "2024-01-30T21:00:00Z", + "value": 2 + } + }, + { + "data": { + "end_time": "2024-01-31T00:00:00Z", + "issue_time": "2024-01-29T12:34:56Z", + "start_time": "2024-01-30T21:00:00Z", + "value": 2 + } + }, + { + "data": { + "end_time": "2024-01-31T00:00:00Z", + "issue_time": "2024-01-28T12:30:03Z", + "start_time": "2024-01-30T21:00:00Z", + "value": 3 + } + }, + { + "data": { + "end_time": "2024-01-30T21:00:00Z", + "issue_time": "2024-01-30T12:39:59Z", + "start_time": "2024-01-30T18:00:00Z", + "value": 6 + } + }, + { + "data": { + "end_time": "2024-01-30T21:00:00Z", + "issue_time": "2024-01-29T12:34:56Z", + "start_time": "2024-01-30T18:00:00Z", + "value": 3 + } + }]""" + + temp_file = temp_dir / "kp.json" + with open(temp_file, "w") as f: + f.write(sample_data) + + df = instance._process_single_file(temp_file) + + assert isinstance(df, pd.DataFrame) + assert "Kp" in df.columns + assert len(df) == 2 + + expected_values = [ + 6, + 2, + ] + + for actual, expected in zip(df["Kp"].values, expected_values): + assert np.isclose(actual, expected, atol=0.0) + + finally: + shutil.rmtree(temp_dir, ignore_errors=True) + + def test_reprocess_files_flag(self, kp_sidc_instance): + current_time = datetime.now(timezone.utc) + end_time = current_time + timedelta(days=1) + + kp_sidc_instance.download_and_process(current_time, end_time, reprocess_files=True) + + file_paths, _ = kp_sidc_instance._get_processed_file_list(current_time, end_time) + + initial_data = None + for file_path in file_paths: + if file_path.exists(): + initial_data = pd.read_csv(file_path, names=["t", "kp"]) + break + + assert initial_data is not None + + kp_sidc_instance.download_and_process(current_time, end_time, reprocess_files=False) + + unchanged_data = pd.read_csv(file_path, names=["t", "kp"]) + pd.testing.assert_frame_equal(initial_data, unchanged_data) From 383cac22c212570926b3b6776f205c877b0e0e6d Mon Sep 17 00:00:00 2001 From: Sahil Jhawar Date: Thu, 26 Mar 2026 13:14:08 +0100 Subject: [PATCH 11/13] test: add bgs tests --- tests/io/kp/data/kp.html | 254 +++++++++++++++++++++++++++++++++++++ tests/io/kp/test_kp_bgs.py | 160 +++++++++++++++++++++++ 2 files changed, 414 insertions(+) create mode 100644 tests/io/kp/data/kp.html create mode 100644 tests/io/kp/test_kp_bgs.py diff --git a/tests/io/kp/data/kp.html b/tests/io/kp/data/kp.html new file mode 100644 index 0000000..a9551c4 --- /dev/null +++ b/tests/io/kp/data/kp.html @@ -0,0 +1,254 @@ + + + + + + +Results of Geomagnetic Data query + + + + + + + + + + + + + + + + + + + + + +
+
+ + + + + +
+
+

Results of Geomagnetic Data query

+
+

Geomagnetic Data

DateKp: Planetary 3Hr Range IndexSum Kpap: Planetary Equiv. AmplitudeApCpC9
01-01-254053506067806340453 27 56 48 80111207 94 27 811.87
02-01-253320334743373330277 18 7 18 39 32 22 18 15 211.15
03-01-2530333320 7131017163 15 18 18 7 3 5 4 6 100.52
04-01-252047504333435330320 7 39 48 32 18 32 56 15 311.36
05-01-253323233330434027253 18 9 9 18 15 32 27 12 181.05
06-01-253037273027302323227 15 22 12 15 12 15 9 9 140.84
07-01-252327232723332020197 9 12 9 12 9 18 7 7 100.63
08-01-25372317 0102310 7127 22 9 6 0 4 9 4 3 70.42
09-01-252010172037271713160 7 4 6 7 22 12 6 5 90.52
10-01-251033202730132340197 4 18 7 12 15 5 9 27 120.73
11-01-2523101310173013 7123 9 4 5 4 6 15 5 3 60.31
12-01-251317171723131317130 5 6 6 6 9 5 5 6 60.31
13-01-254023102320232320183 27 9 4 9 7 9 9 7 100.63
14-01-252723232020273027197 12 9 9 7 7 12 15 12 100.63
15-01-253337372730332717240 18 22 22 12 15 18 12 6 160.94
16-01-252733172330303727223 12 18 6 9 15 15 22 12 140.84
17-01-253340373033304040283 18 27 22 15 18 15 27 27 211.15
18-01-252020232023202020167 7 7 9 7 9 7 7 7 80.42
19-01-251023372733303037227 4 9 22 12 18 15 15 22 150.84
20-01-253733303043302737267 22 18 15 15 32 15 12 22 191.05
21-01-252330201710273023180 9 15 7 6 4 12 15 9 100.52
22-01-252720302317131030170 12 7 15 9 6 5 4 15 90.52
23-01-253317 71013373723177 18 6 3 4 5 22 22 9 110.63
24-01-251313231320101023127 5 5 9 5 7 4 4 9 60.31
25-01-25 3 0 3 3 710 0 7 33 2 0 2 2 3 4 0 3 20.00
26-01-25 3 7 3 3 3 0 0 3 23 2 3 2 2 2 0 0 2 20.00
27-01-25 0 0131713272020110 0 0 5 6 5 12 7 7 50.21
28-01-252013 72327273727180 7 5 3 9 12 12 22 12 100.63
29-01-252010102020171313123 7 4 4 7 7 6 5 5 60.31
30-01-25 7 7 7 3 7132027 90 3 3 3 2 3 5 7 12 50.21
31-01-2517 7101720172027133 6 3 4 6 7 6 7 12 60.31
+
+
+ + + + +
+
+ + + diff --git a/tests/io/kp/test_kp_bgs.py b/tests/io/kp/test_kp_bgs.py new file mode 100644 index 0000000..3fe10f2 --- /dev/null +++ b/tests/io/kp/test_kp_bgs.py @@ -0,0 +1,160 @@ +# SPDX-FileCopyrightText: 2025 GFZ Helmholtz Centre for Geosciences +# +# SPDX-License-Identifier: Apache-2.0 + +import os +import shutil +from datetime import datetime, timedelta, timezone +from pathlib import Path + +import numpy as np +import pandas as pd +import pytest + +from swvo.io.kp import KpBGS + +TEST_DIR = Path("test_data") +DATA_DIR = TEST_DIR / "mock_kp_bgs" + + +class TestKpBGS: + @pytest.fixture(scope="session", autouse=True) + def setup_and_cleanup(self): + TEST_DIR.mkdir(exist_ok=True) + DATA_DIR.mkdir(exist_ok=True) + + yield + + if TEST_DIR.exists(): + shutil.rmtree(TEST_DIR, ignore_errors=True) + + @pytest.fixture + def kp_bgs_instance(self): + return KpBGS(data_dir=DATA_DIR) + + def test_initialization_with_data_dir(self): + instance = KpBGS(data_dir=DATA_DIR) + assert instance.data_dir == DATA_DIR + assert instance.data_dir.exists() + + def test_initialization_without_env_var(self): + if KpBGS.ENV_VAR_NAME in os.environ: + del os.environ[KpBGS.ENV_VAR_NAME] + with pytest.raises( + ValueError, + match=f"Necessary environment variable {KpBGS.ENV_VAR_NAME} not set!", + ): + KpBGS() + + def test_initialization_with_env_var(self): + os.environ[KpBGS.ENV_VAR_NAME] = str(DATA_DIR) + instance = KpBGS() + assert instance.data_dir == DATA_DIR + + def test_get_processed_file_list(self): + instance = KpBGS(data_dir=DATA_DIR) + start_time = datetime(2024, 1, 1) + end_time = datetime(2024, 5, 3) + + file_paths, time_intervals = instance._get_processed_file_list(start_time, end_time) + + assert len(file_paths) == 5 + assert len(time_intervals) == 5 + + assert file_paths[0].name == "BGS_KP_FORECAST_202401.csv" + assert file_paths[-1].name == "BGS_KP_FORECAST_202405.csv" + + assert time_intervals[0][0] == datetime(2024, 1, 1, tzinfo=timezone.utc) + assert time_intervals[0][1] == datetime(2024, 1, 31, 23, 59, 59, tzinfo=timezone.utc) + + def test_download_and_process_current_month(self, kp_bgs_instance): + current_time = datetime.now(timezone.utc) + end_time = current_time + timedelta(days=2) + + kp_bgs_instance.download_and_process(current_time, reprocess_files=True) + + file_paths, _ = kp_bgs_instance._get_processed_file_list(current_time, end_time) + + for file_path in file_paths: + if file_path.exists(): + df = pd.read_csv(file_path, names=["t", "kp"]) + assert len(df) > 0 + assert "t" in df.columns + assert "kp" in df.columns + + valid_kps = df["kp"].dropna() + assert valid_kps.min() >= 0 + assert valid_kps.max() <= 9 + + def test_download_past_month(self, kp_bgs_instance): + past_time = datetime.now(timezone.utc) - timedelta(days=32) + end_time = past_time + timedelta(days=2) + + kp_bgs_instance.download_and_process(past_time) + + file_paths, _ = kp_bgs_instance._get_processed_file_list(past_time, end_time) + + for file_path in file_paths: + assert not file_path.exists() # Files are only available for 2 months past, so these should not exist + + def test_read_with_download(self, kp_bgs_instance): + current_time = datetime.now() + end_time = current_time + timedelta(days=1) + + data = kp_bgs_instance.read(current_time, end_time, download=True) + + assert isinstance(data, pd.DataFrame) + assert "kp" in data.columns + assert "file_name" in data.columns + assert isinstance(data.index, pd.DatetimeIndex) + + assert data.index[0] >= current_time.replace(tzinfo=timezone.utc) - timedelta(hours=3) + assert data.index[-1] <= end_time.replace(tzinfo=timezone.utc) + timedelta(hours=3) + + def test_read_without_download_no_file(self, kp_bgs_instance): + current_time = datetime.now() + end_time = current_time + timedelta(days=1) + + if DATA_DIR.exists(): + for file in DATA_DIR.glob("**/BGS_KP_FORECAST_*.csv"): + file.unlink() + + data = kp_bgs_instance.read(current_time, end_time, download=False) + assert isinstance(data, pd.DataFrame) + assert data["kp"].isna().all() + + def test_process_single_file(self): + instance = KpBGS(data_dir=DATA_DIR) + df = instance._process_single_file("tests/io/kp/data/kp.html") + + assert isinstance(df, pd.DataFrame) + assert "Kp" in df.columns + assert len(df) == 248 + + # fmt: off + expected_values = [4.0, 5.333, 5.0, 6.0, 6.667, 8.0, 6.333, 4.0, 3.333, 2.0, 3.333, 4.667, 4.333, 3.667, 3.333, 3.0, 3.0, 3.333, 3.333, 2.0, 0.667, 1.333, 1.0, 1.667, 2.0, 4.667, 5.0, 4.333, 3.333, 4.333, 5.333, 3.0, 3.333, 2.333, 2.333, 3.333, 3.0, 4.333, 4.0, 2.667, 3.0, 3.667, 2.667, 3.0, 2.667, 3.0, 2.333, 2.333, 2.333, 2.667, 2.333, 2.667, 2.333, 3.333, 2.0, 2.0, 3.667, 2.333, 1.667, 0.0, 1.0, 2.333, 1.0, 0.667, 2.0, 1.0, 1.667, 2.0, 3.667, 2.667, 1.667, 1.333, 1.0, 3.333, 2.0, 2.667, 3.0, 1.333, 2.333, 4.0, 2.333, 1.0, 1.333, 1.0, 1.667, 3.0, 1.333, 0.667, 1.333, 1.667, 1.667, 1.667, 2.333, 1.333, 1.333, 1.667, 4.0, 2.333, 1.0, 2.333, 2.0, 2.333, 2.333, 2.0, 2.667, 2.333, 2.333, 2.0, 2.0, 2.667, 3.0, 2.667, 3.333, 3.667, 3.667, 2.667, 3.0, 3.333, 2.667, 1.667, 2.667, 3.333, 1.667, 2.333, 3.0, 3.0, 3.667, 2.667, 3.333, 4.0, 3.667, 3.0, 3.333, 3.0, 4.0, 4.0, 2.0, 2.0, 2.333, 2.0, 2.333, 2.0, 2.0, 2.0, 1.0, 2.333, 3.667, 2.667, 3.333, 3.0, 3.0, 3.667, 3.667, 3.333, 3.0, 3.0, 4.333, 3.0, 2.667, 3.667, 2.333, 3.0, 2.0, 1.667, 1.0, 2.667, 3.0, 2.333, 2.667, 2.0, 3.0, 2.333, 1.667, 1.333, 1.0, 3.0, 3.333, 1.667, 0.667, 1.0, 1.333, 3.667, 3.667, 2.333, 1.333, 1.333, 2.333, 1.333, 2.0, 1.0, 1.0, 2.333, 0.333, 0.0, 0.333, 0.333, 0.667, 1.0, 0.0, 0.667, 0.333, 0.667, 0.333, 0.333, 0.333, 0.0, 0.0, 0.333, 0.0, 0.0, 1.333, 1.667, 1.333, 2.667, 2.0, 2.0, 2.0, 1.333, 0.667, 2.333, 2.667, 2.667, 3.667, 2.667, 2.0, 1.0, 1.0, 2.0, 2.0, 1.667, 1.333, 1.333, 0.667, 0.667, 0.667, 0.333, 0.667, 1.333, 2.0, 2.667, 1.667] + # fmt: on + + for actual, expected in zip(df["Kp"].values, expected_values): + assert np.isclose(actual, expected, atol=0.0) + + def test_reprocess_files_flag(self, kp_bgs_instance): + current_time = datetime(2026, 1, 1, tzinfo=timezone.utc) + end_time = current_time + timedelta(days=60) + + kp_bgs_instance.download_and_process(current_time, reprocess_files=True) + + file_paths, _ = kp_bgs_instance._get_processed_file_list(current_time, end_time) + + initial_data = None + for file_path in file_paths: + if file_path.exists(): + initial_data = pd.read_csv(file_path, names=["t", "kp"]) + break + + assert initial_data is not None + + kp_bgs_instance.download_and_process(current_time, reprocess_files=False) + + unchanged_data = pd.read_csv(file_path, names=["t", "kp"]) + pd.testing.assert_frame_equal(initial_data, unchanged_data) From aac9a7f3d2ec36b6e1adb7bdb8cc25d4571db97d Mon Sep 17 00:00:00 2001 From: Sahil Jhawar Date: Thu, 26 Mar 2026 13:22:20 +0100 Subject: [PATCH 12/13] chore: add license header --- tests/io/kp/data/kp.html | 7 +++++++ tests/io/kp/test_kp_bgs.py | 3 ++- tests/io/kp/test_kp_sidc.py | 3 ++- 3 files changed, 11 insertions(+), 2 deletions(-) diff --git a/tests/io/kp/data/kp.html b/tests/io/kp/data/kp.html index a9551c4..1da0018 100644 --- a/tests/io/kp/data/kp.html +++ b/tests/io/kp/data/kp.html @@ -1,3 +1,10 @@ + + diff --git a/tests/io/kp/test_kp_bgs.py b/tests/io/kp/test_kp_bgs.py index 3fe10f2..123d6c9 100644 --- a/tests/io/kp/test_kp_bgs.py +++ b/tests/io/kp/test_kp_bgs.py @@ -1,4 +1,5 @@ -# SPDX-FileCopyrightText: 2025 GFZ Helmholtz Centre for Geosciences +# SPDX-FileCopyrightText: 2026 GFZ Helmholtz Centre for Geosciences +# SPDX-FileContributor: Sahil Jhawar # # SPDX-License-Identifier: Apache-2.0 diff --git a/tests/io/kp/test_kp_sidc.py b/tests/io/kp/test_kp_sidc.py index 496d4b5..b8544bc 100644 --- a/tests/io/kp/test_kp_sidc.py +++ b/tests/io/kp/test_kp_sidc.py @@ -1,4 +1,5 @@ -# SPDX-FileCopyrightText: 2025 GFZ Helmholtz Centre for Geosciences +# SPDX-FileCopyrightText: 2026 GFZ Helmholtz Centre for Geosciences +# SPDX-FileContributor: Sahil Jhawar # # SPDX-License-Identifier: Apache-2.0 From 13a943ebb3b865b2c88d0ba95a90a11aeebc3291 Mon Sep 17 00:00:00 2001 From: Sahil Jhawar Date: Thu, 26 Mar 2026 13:41:02 +0100 Subject: [PATCH 13/13] feat: add BGS and SIDC in multiple model reader --- swvo/io/kp/read_kp_from_multiple_models.py | 28 +++++++++++----------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/swvo/io/kp/read_kp_from_multiple_models.py b/swvo/io/kp/read_kp_from_multiple_models.py index 0ba8e34..8d08cee 100644 --- a/swvo/io/kp/read_kp_from_multiple_models.py +++ b/swvo/io/kp/read_kp_from_multiple_models.py @@ -16,7 +16,7 @@ import pandas as pd from swvo.io.exceptions import ModelError -from swvo.io.kp import KpEnsemble, KpNiemegk, KpOMNI, KpSWPC +from swvo.io.kp import KpBGS, KpEnsemble, KpNiemegk, KpOMNI, KpSIDC, KpSWPC from swvo.io.utils import ( any_nans, construct_updated_data_frame, @@ -25,7 +25,7 @@ logger = logging.getLogger(__name__) -KpModel = KpEnsemble | KpNiemegk | KpOMNI | KpSWPC +KpModel = KpEnsemble | KpNiemegk | KpOMNI | KpSWPC | KpBGS | KpSIDC logging.captureWarnings(True) @@ -39,7 +39,7 @@ def read_kp_from_multiple_models( # noqa: PLR0913 *, download: bool = False, recurrence: bool = False, - rec_model_order: Sequence[KpOMNI | KpNiemegk] | None = None, + rec_model_order: Sequence[KpOMNI | KpNiemegk | KpBGS] | None = None, ) -> pd.DataFrame | list[pd.DataFrame]: """Read Kp data from multiple models. @@ -67,7 +67,7 @@ def read_kp_from_multiple_models( # noqa: PLR0913 recurrence : bool, optional If True, fill missing values using 27-day recurrence from historical models (OMNI, Niemegk). Defaults to False. - rec_model_order : Sequence[KpOMNI | KpNiemegk], optional + rec_model_order : Sequence[KpOMNI | KpNiemegk | KpBGS], optional The order in which historical models will be used for 27-day recurrence filling. Defaults to [OMNI, Niemegk]. @@ -113,7 +113,7 @@ def read_kp_from_multiple_models( # noqa: PLR0913 if recurrence: if rec_model_order is None: - rec_model_order = [m for m in model_order if isinstance(m, (KpOMNI, KpNiemegk))] + rec_model_order = [m for m in model_order if isinstance(m, (KpOMNI, KpNiemegk, KpBGS))] for i, df in enumerate(data_out): if not df.empty: data_out[i] = _recursive_fill_27d_historical(df, download, rec_model_order) @@ -157,7 +157,7 @@ def _read_from_model( # noqa: PLR0913 """ # Read from historical models - if isinstance(model, (KpOMNI, KpNiemegk)): + if isinstance(model, (KpOMNI, KpNiemegk, KpBGS)): data_one_model = _read_historical_model( model, start_time, @@ -167,9 +167,9 @@ def _read_from_model( # noqa: PLR0913 ) # Forecasting models are called with synthetic now time - if isinstance(model, KpSWPC): + if isinstance(model, (KpSWPC, KpSIDC)): logger.info( - f"Reading swpc from {historical_data_cutoff_time.replace(hour=0, minute=0, second=0)} to {end_time}\noriginal historical_data_cutoff_time: {historical_data_cutoff_time}" + f"Reading {model.LABEL} from {historical_data_cutoff_time.replace(hour=0, minute=0, second=0)} to {end_time}\noriginal historical_data_cutoff_time: {historical_data_cutoff_time}" ) data_one_model = [ model.read( @@ -191,18 +191,18 @@ def _read_from_model( # noqa: PLR0913 def _read_historical_model( - model: KpOMNI | KpNiemegk, + model: KpOMNI | KpNiemegk | KpBGS, start_time: datetime, end_time: datetime, historical_data_cutoff_time: datetime, *, download: bool, ) -> pd.DataFrame: - """Reads Kp data from historical models (KpOMNI or KpNiemegk) within the specified time range. + """Reads Kp data from historical models (KpOMNI or KpNiemegk or KpBGS) within the specified time range. Parameters ---------- - model : KpOMNI | KpNiemegk + model : KpOMNI | KpNiemegk | KpBGS The historical model from which to read the data. start_time : datetime The start time of the data range. @@ -221,10 +221,10 @@ def _read_historical_model( Raises ------ TypeError - If the provided model is not an instance of KpOMNI or KpNiemegk. + If the provided model is not an instance of KpOMNI or KpNiemegk or KpBGS. """ - if not isinstance(model, (KpOMNI, KpNiemegk)): + if not isinstance(model, (KpOMNI, KpNiemegk, KpBGS)): msg = "Encountered invalide model type in read historical model!" raise TypeError(msg) @@ -355,7 +355,7 @@ def _recursive_fill_27d_historical(df, download, historical_models): DataFrame to fill with gaps. download : bool Download new data or not. - historical_models : list[KpOMNI | KpNiemegk] + historical_models : list[KpOMNI | KpNiemegk | KpBGS] List of historical models to use for filling gaps. value_col : str, optional _description_, by default "kp"