Source code for RES.hdf5_handler

import warnings
from pathlib import Path
from typing import Optional

import geopandas as gpd
import h5py
import pandas as pd
from colorama import Fore, Style
from shapely.geometry.base import BaseGeometry
from shapely.wkt import dumps, loads

import RES.utility as utils


class DataHandler:
    """HDF5-based data storage manager for geospatial renewable energy datasets.

    Provides efficient storage and retrieval of large DataFrame and GeoDataFrame
    datasets using the HDF5 format. Handles geometry serialization for spatial
    data and implements caching mechanisms for workflow optimization.

    Parameters
    ----------
    hdf_file_path : Path
        Path to the HDF5 storage file.
    silent_initiation : bool, default True
        Suppress initialization messages.
    show_structure : bool, default False
        Display HDF5 file structure on initialization.

    Attributes
    ----------
    store : Path
        Path to the HDF5 storage file.
    geom_columns : list
        Column names containing geometry data for special handling.
    """

    def __init__(self,
                 hdf_file_path: Optional[Path] = None,
                 silent_initiation: bool = True,
                 show_structure: bool = False):
        """Initialize the HDF5 data handler.

        Args:
            hdf_file_path: Path to the HDF5 storage file.
            silent_initiation: Suppress initialization messages.
            show_structure: Display file structure after initialization.
        """
        # Geometry-bearing columns that require WKT (de)serialization.
        self.geom_columns = ['geometry', 'nearest_connection_point', 'centroid']

        if hdf_file_path is None:
            warnings.warn("Store path not set during initialization. "
                          "Define store path when calling methods.")
        else:
            self.store = Path(hdf_file_path)
            if not silent_initiation:
                utils.print_update(level=2, message=f"Store initialized: {hdf_file_path}")
            if show_structure:
                self.show_tree(self.store)
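    # Usage sketch (illustrative, not part of the module): the file name
    # "res_data.h5" below is a hypothetical example path.
    #
    #   from pathlib import Path
    #   from RES.hdf5_handler import DataHandler
    #
    #   handler = DataHandler(Path("res_data.h5"), silent_initiation=False)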
    def to_store(self,
                 data: pd.DataFrame,
                 key: str,
                 hdf_file_path: Optional[Path] = None,
                 force_update: bool = False):
        """Save a DataFrame or GeoDataFrame to HDF5 storage.

        Serializes geometry columns to WKT before writing and merges new
        columns into existing keys to avoid data duplication.

        Args:
            data: DataFrame or GeoDataFrame to store.
            key: Storage key identifier.
            hdf_file_path: Optional override for the storage file path.
            force_update: Force overwrite of existing data.

        Raises:
            TypeError: If data is not a DataFrame or GeoDataFrame.
        """
        if hdf_file_path is not None:
            self.store = Path(hdf_file_path)

        if isinstance(data, (pd.DataFrame, gpd.GeoDataFrame)):
            self.data_new = data.copy()
        else:
            raise TypeError(f"{__name__}| ❌ 'data' to be stored must be a DataFrame or GeoDataFrame.")

        with pd.HDFStore(self.store, mode='a') as store:  # append mode
            if key not in store or force_update:
                # Serialize any geometry-bearing columns to WKT strings.
                for geom_col in self.geom_columns:
                    if geom_col in self.data_new.columns \
                            and not self.data_new.empty \
                            and isinstance(self.data_new[geom_col].iloc[0], BaseGeometry):
                        self.data_new[geom_col] = self.data_new[geom_col].apply(dumps)
                # Write through the already-open store handle; cast to a plain
                # DataFrame so no geometry dtype reaches the HDF5 writer.
                store.put(key, pd.DataFrame(self.data_new))
                utils.print_update(level=3, message=f"{__name__}| 💾 Data (GeoDataFrame/DataFrame) saved to {self.store} with key '{key}'")
            else:
                # Merge columns that are new relative to the stored dataset.
                self.data_ext = store.get(key)
                for column in self.data_new.columns:
                    if not self.data_new.empty and column not in self.data_ext.columns:
                        self.data_ext[column] = self.data_new[column]

                self.updated_data = self.data_ext
                for geom_col in self.geom_columns:
                    if geom_col in self.updated_data.columns \
                            and isinstance(self.updated_data[geom_col].iloc[0], BaseGeometry):
                        self.updated_data[geom_col] = self.updated_data[geom_col].apply(dumps)
                        utils.print_update(level=4, message=f"{__name__}| 💾 Updated key '{key}' with column '{geom_col}'")
                store.put(key, pd.DataFrame(self.updated_data))
                utils.print_update(level=3, message=f"{__name__}| 💾 Updated '{key}' saved to {self.store}")
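    # Example (hedged sketch): storing a GeoDataFrame under a hypothetical
    # key "sites"; geometry columns are serialized to WKT automatically.
    # "sites.geojson" is an assumed input file.
    #
    #   import geopandas as gpd
    #   gdf = gpd.read_file("sites.geojson")
    #   handler.to_store(gdf, key="sites", force_update=True)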
    def from_store(self, key: str):
        """Load data from HDF5 storage with geometry reconstruction.

        Deserializes WKT geometry columns and returns a GeoDataFrame when
        spatial data is present, otherwise a plain DataFrame.

        Args:
            key: Storage key identifier.

        Returns:
            DataFrame or GeoDataFrame with reconstructed geometry columns,
            or None if the key is not found or loading fails.
        """
        try:
            with pd.HDFStore(self.store, 'r') as store:
                if key not in store:
                    utils.print_update(level=3, message=f"{__name__}| ❌ Error: Key '{key}' not found in {self.store}")
                    return None
                self.data = store.get(key)

            # Convert WKT strings back into shapely geometries.
            for geom_col in self.geom_columns:
                if geom_col in self.data.columns:
                    self.data[geom_col] = self.data[geom_col].apply(loads)
            if 'geometry' in self.data.columns:
                return gpd.GeoDataFrame(self.data, geometry='geometry', crs='EPSG:4326')

            # No geometry present: return the regular DataFrame.
            if key == 'timeseries':
                utils.print_info(f"{__name__}| 'timeseries' key access suggestions: use '.solar' to access Solar-timeseries and '.wind' for Wind-timeseries.")
            return self.data
        except Exception as e:
            utils.print_update(level=3, message=f"{__name__}| ❌ Error loading data from store: {e}")
            return None
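    # Example (hedged sketch): loading the hypothetical "sites" key back.
    # A GeoDataFrame is returned because a 'geometry' column is present.
    #
    #   sites = handler.from_store("sites")
    #   print(type(sites))   # <class 'geopandas.geodataframe.GeoDataFrame'>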
    def refresh(self):
        """Create a new DataHandler instance bound to the current store path.

        Useful for reloading the handler against the same store without
        rebuilding the surrounding workflow state.

        Returns:
            DataHandler: A new instance with the same store path.
        """
        return DataHandler(self.store, silent_initiation=True, show_structure=False)
    @staticmethod
    def show_tree(store_path, show_dataset: bool = False):
        """Print a structured view of the keys and datasets within an HDF5 file,
        allowing users to understand its organization.

        Parameters:
            store_path (Path): Path to the HDF5 file.
            show_dataset (bool): If True, also show datasets within the groups.

        Raises:
            Exception: If there is an error reading the file.

        Returns:
            None: The structure is printed to the console.
        """
        def print_structure(name, obj, indent=""):
            """Recursively print groups (keys) and, optionally, datasets."""
            if isinstance(obj, h5py.Group):
                print(f"{indent}{Fore.LIGHTBLUE_EX}[key]{Style.RESET_ALL} "
                      f"{Fore.LIGHTGREEN_EX}{name}{Style.RESET_ALL}")
                for sub_key in obj.keys():
                    print_structure(f"{name}/{sub_key}", obj[sub_key], indent + " └─ ")
            elif show_dataset and isinstance(obj, h5py.Dataset):
                print(f"{indent}[Dataset] {name} - Shape: {obj.shape}, Type: {obj.dtype}")

        try:
            with h5py.File(store_path, 'r') as f:
                utils.print_module_title(f"{__name__}| 🗄️ Structure of HDF5 file: {store_path}")
                for key in f.keys():
                    print_structure(key, f[key])
                print("\n")
                utils.print_update(level=1, message="To access the data:")
                utils.print_update(level=2, message="<datahandler instance>.from_store('<key>')")
        except Exception as e:
            utils.print_update(message=f"{__name__}| ❌ Error reading file: {e}", alert=True)
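    # Example (hedged sketch): inspecting a store without instantiating the
    # handler, assuming the hypothetical "res_data.h5" exists.
    #
    #   DataHandler.show_tree(Path("res_data.h5"), show_dataset=True)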
    def clean_store(self):
        """Clean the HDF5 store by removing all keys and datasets."""
        with h5py.File(self.store, "a") as hdf_file:
            # Snapshot the keys first, since deletion mutates the file.
            keys_to_delete = list(hdf_file.keys())
            for key in keys_to_delete:
                del hdf_file[key]
        utils.print_update(level=3, message=f"{__name__}| 🗑️ All keys have been deleted from the store: {self.store}")
    @staticmethod
    def del_key(store_path, key_to_delete: str):
        """Delete a specific key from the HDF5 file.

        Parameters:
            store_path (Path): Path to the HDF5 file.
            key_to_delete (str): The key to delete from the HDF5 file.

        Returns:
            None: The status of the deletion operation is printed.

        Example:
            >>> DataHandler.del_key(Path('data.h5'), 'my_key')
            This will delete 'my_key' from the 'data.h5' file if it exists.
        """
        # Open the HDF5 file in read/write mode.
        with h5py.File(store_path, "r+") as hdf_file:
            if key_to_delete in hdf_file:
                del hdf_file[key_to_delete]
                utils.print_update(level=3, message=f"{__name__}| Key '{key_to_delete}' has been deleted. Store status:\n")
            else:
                utils.print_update(level=3, message=f"{__name__}| Key '{key_to_delete}' not found in the file. Store status:\n")
        # Show the resulting structure after the file handle is released.
        DataHandler.show_tree(store_path)
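# End-to-end usage sketch (illustrative; file and key names are hypothetical):
#
#   from pathlib import Path
#   import pandas as pd
#   from RES.hdf5_handler import DataHandler
#
#   handler = DataHandler(Path("res_data.h5"))
#   handler.to_store(pd.DataFrame({"capacity_mw": [10, 25]}), key="plants")
#   plants = handler.from_store("plants")
#   DataHandler.show_tree(Path("res_data.h5"))
#   DataHandler.del_key(Path("res_data.h5"), "plants")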