Source code for optimpv.general.general

"""General functions"""
######### Package Imports #########################################################################

from sklearn.metrics import (max_error, mean_squared_error, mean_absolute_error,
                             mean_absolute_percentage_error, mean_squared_log_error,
                             root_mean_squared_error, root_mean_squared_log_error,
                             median_absolute_error)
import numpy as np
from scipy.spatial import distance

######### Function Definitions ####################################################################
def calc_metric(y, yfit, sample_weight=None, metric_name='mse'):
    """Calculate the metric between the true values and the predicted values

    Parameters
    ----------
    y : array-like of shape (n_samples,)
        True values
    yfit : array-like of shape (n_samples,)
        Predicted values
    sample_weight : array-like of shape (n_samples,), optional
        Sample weights, by default None
    metric_name : str, optional
        Name of the metric to calculate, by default 'mse'
        Possible values are:

            - 'mse': Mean Squared Error
            - 'mae': Mean Absolute Error
            - 'mape': Mean Absolute Percentage Error
            - 'msle': Mean Squared Log Error
            - 'rmsle': Root Mean Squared Log Error
            - 'rmse': Root Mean Squared Error
            - 'medae': Median Absolute Error
            - 'nrmse': Normalized Root Mean Squared Error
            - 'rmsre': Root Mean Squared Relative Error
            - 'maxe': Maximum Error

    Returns
    -------
    float
        The calculated metric

    Raises
    ------
    ValueError
        If the metric is not implemented
    """
    # Check if NaN values are present
    if np.isnan(y).any() or np.isnan(yfit).any():
        return np.nan

    metric_name = metric_name.lower()
    if metric_name == 'mse':
        return mean_squared_error(y, yfit, sample_weight=sample_weight)
    elif metric_name == 'mae':
        return mean_absolute_error(y, yfit, sample_weight=sample_weight)
    elif metric_name == 'mape':
        return mean_absolute_percentage_error(y, yfit, sample_weight=sample_weight)
    elif metric_name == 'msle':
        return mean_squared_log_error(y, yfit, sample_weight=sample_weight)
    elif metric_name == 'rmsle':
        return root_mean_squared_log_error(y, yfit, sample_weight=sample_weight)
    elif metric_name == 'rmse':
        return root_mean_squared_error(y, yfit, sample_weight=sample_weight)
    elif metric_name == 'medae':
        return median_absolute_error(y, yfit, sample_weight=sample_weight)
    elif metric_name == 'nrmse':
        # Normalize the RMSE by the data range taken over both arrays
        maxi = max(np.max(y), np.max(yfit))
        mini = min(np.min(y), np.min(yfit))
        return root_mean_squared_error(y, yfit, sample_weight=sample_weight) / (maxi - mini)
    elif metric_name == 'rmsre':
        # Guard the relative error against division by zero with machine epsilon
        epsilon = np.finfo(np.float64).eps
        return np.sqrt(np.mean(((y - yfit) / np.maximum(np.abs(y), epsilon))**2))
    elif metric_name == 'maxe':
        return max_error(y, yfit)
    else:
        raise ValueError('The metric ' + metric_name + ' is not implemented.')
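# Example usage of calc_metric (a minimal sketch; the arrays below are
# illustrative values, not taken from the original module):
#
#   >>> y = np.array([1.0, 2.0, 3.0])
#   >>> yfit = np.array([1.1, 1.9, 3.3])
#   >>> calc_metric(y, yfit, metric_name='rmse')       # -> ~0.191
#   >>> calc_metric(y, np.array([1.1, np.nan, 3.3]))   # NaN inputs short-circuit -> nan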
def loss_function(value, loss='linear'):
    """Calculate the loss function for the given value. Inspired by the scipy loss
    functions (https://docs.scipy.org/doc/scipy/reference/generated/scipy.optimize.least_squares.html).
    As in scipy, the input ``z`` is assumed to be non-negative (typically a squared
    residual or a metric value).

    The following loss functions are implemented:

        * 'linear' (default) : ``rho(z) = z``. Gives a standard least-squares problem.
        * 'soft_l1' : ``rho(z) = 2 * ((1 + z)**0.5 - 1)``. The smooth approximation of
          l1 (absolute value) loss. Usually a good choice for robust least squares.
        * 'huber' : ``rho(z) = z if z <= 1 else 2*z**0.5 - 1``. Works similarly to 'soft_l1'.
        * 'cauchy' : ``rho(z) = ln(1 + z)``. Severely weakens outliers influence, but may
          cause difficulties in the optimization process.
        * 'arctan' : ``rho(z) = arctan(z)``. Limits the maximum loss on a single residual,
          has properties similar to 'cauchy'.
        * 'log' : ``rho(z) = log(z)``. Logarithmically scales the loss, very similar to
          'cauchy' but not as safe.
        * 'log10' : ``rho(z) = log10(z)``. Logarithmically scales the loss with the base 10
          log, very similar to 'cauchy' but not as safe.

    Parameters
    ----------
    value : float
        value to calculate the loss function for
    loss : str, optional
        loss function to use, by default 'linear'

    Returns
    -------
    float
        value of the loss function

    Raises
    ------
    ValueError
        If the loss function is not implemented
    """
    loss = loss.lower()
    if loss == 'linear':
        return value
    elif loss == 'log':
        return np.log(abs(value))
    elif loss == 'log10':
        return np.log10(abs(value))
    elif loss == 'soft_l1':
        return 2 * ((1 + value)**0.5 - 1)
    elif loss == 'cauchy':
        return np.log(1 + value)
    elif loss == 'arctan':
        return np.arctan(value)
    elif loss == 'huber':
        # Note: for value < -1 the linear branch below is undefined (square root
        # of a negative number); a non-negative input is assumed, as in scipy.
        if abs(value) <= 1:
            return value
        else:
            return 2 * value**0.5 - 1
    else:
        raise ValueError('The loss ' + loss + ' is not implemented.')
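# Example usage of loss_function (illustrative; the input is assumed to be a
# non-negative quantity such as a squared residual, as in scipy):
#
#   >>> loss_function(4.0, loss='linear')   # -> 4.0
#   >>> loss_function(4.0, loss='huber')    # 2*4**0.5 - 1 -> 3.0
#   >>> loss_function(4.0, loss='cauchy')   # ln(1 + 4) -> ~1.609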
def inv_loss_function(value, loss='linear'):
    """Calculate the inverse loss function for the given value, i.e. undo the
    transformation applied by ``loss_function``. Inspired by the scipy loss
    functions (https://docs.scipy.org/doc/scipy/reference/generated/scipy.optimize.least_squares.html).

    The following (forward) loss functions are implemented:

        * 'linear' (default) : ``rho(z) = z``. Gives a standard least-squares problem.
        * 'soft_l1' : ``rho(z) = 2 * ((1 + z)**0.5 - 1)``. The smooth approximation of
          l1 (absolute value) loss. Usually a good choice for robust least squares.
        * 'huber' : ``rho(z) = z if z <= 1 else 2*z**0.5 - 1``. Works similarly to 'soft_l1'.
        * 'cauchy' : ``rho(z) = ln(1 + z)``. Severely weakens outliers influence, but may
          cause difficulties in the optimization process.
        * 'arctan' : ``rho(z) = arctan(z)``. Limits the maximum loss on a single residual,
          has properties similar to 'cauchy'.
        * 'log' : ``rho(z) = log(z)``. Logarithmically scales the loss, very similar to
          'cauchy' but not as safe.
        * 'log10' : ``rho(z) = log10(z)``. Logarithmically scales the loss with the base 10
          log, very similar to 'cauchy' but not as safe.

    Parameters
    ----------
    value : float
        value to calculate the inverse loss function for
    loss : str, optional
        loss function to use, by default 'linear'

    Returns
    -------
    float
        value of the inverse loss function

    Raises
    ------
    ValueError
        If the loss function is not implemented
    """
    loss = loss.lower()
    if loss == 'linear':
        return value
    elif loss == 'log':
        return np.exp(value)
    elif loss == 'log10':
        return 10**value
    elif loss == 'soft_l1':
        # Invert rho(z) = 2*((1 + z)**0.5 - 1)  =>  z = (1 + rho/2)**2 - 1
        return (1 + value / 2)**2 - 1
    elif loss == 'cauchy':
        return np.exp(value) - 1
    elif loss == 'arctan':
        return np.tan(value)
    elif loss == 'huber':
        # Invert rho(z) = 2*z**0.5 - 1 for |rho| > 1  =>  z = ((rho + 1) / 2)**2
        if isinstance(value, np.ndarray):
            return np.where(np.abs(value) <= 1, value, ((value + 1) / 2)**2)
        elif abs(value) <= 1:
            return value
        else:
            return ((value + 1) / 2)**2
    else:
        raise ValueError('The loss ' + loss + ' is not implemented.')
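# Round-trip example for inv_loss_function (illustrative values): applying the
# inverse after the forward loss should recover the original input.
#
#   >>> z = 4.0
#   >>> inv_loss_function(loss_function(z, loss='huber'), loss='huber')    # -> 4.0
#   >>> inv_loss_function(loss_function(z, loss='cauchy'), loss='cauchy')  # -> ~4.0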
def mean_min_euclidean_distance(X_true, y_true, X_fit, y_fit):
    """Calculate the mean of the minimum euclidean distances between the true
    points and the predicted points

    Parameters
    ----------
    X_true : array-like of shape (n_samples,)
        True values of the X coordinate
    y_true : array-like of shape (n_samples,)
        True values of the y coordinate
    X_fit : array-like of shape (n_samples,)
        Predicted values of the X coordinate
    y_fit : array-like of shape (n_samples,)
        Predicted values of the y coordinate

    Returns
    -------
    float
        The average minimum euclidean distance between the true and the predicted values
    """
    Xy_true = np.hstack((X_true.reshape(-1, 1), y_true.reshape(-1, 1)))
    Xy_fit = np.hstack((X_fit.reshape(-1, 1), y_fit.reshape(-1, 1)))

    # For each true point, find the distance to the nearest predicted point,
    # then average these minima. All predicted points are candidates: i and j
    # index two different point sets, so there is no self-pair to exclude.
    dists = []
    for i in range(len(Xy_true)):
        dd = []
        for j in range(len(Xy_fit)):
            dd.append(distance.euclidean(Xy_true[i], Xy_fit[j]))
        dists.append(np.min(dd))
    return np.mean(dists)
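# Example usage of mean_min_euclidean_distance (illustrative curves): each true
# point is matched to its nearest fitted point, so a uniform vertical offset of
# 0.1 gives a mean minimum distance of 0.1.
#
#   >>> X = np.array([0.0, 1.0, 2.0])
#   >>> y_t = np.array([0.0, 1.0, 4.0])
#   >>> mean_min_euclidean_distance(X, y_t, X, y_t + 0.1)   # -> ~0.1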
def direct_mean_euclidean_distance(X_true, y_true, X_fit, y_fit):
    """Calculate the mean euclidean distance between the true and the predicted
    values, pairing points by index

    Parameters
    ----------
    X_true : array-like of shape (n_samples,)
        True values of the X coordinate
    y_true : array-like of shape (n_samples,)
        True values of the y coordinate
    X_fit : array-like of shape (n_samples,)
        Predicted values of the X coordinate
    y_fit : array-like of shape (n_samples,)
        Predicted values of the y coordinate

    Returns
    -------
    float
        The average euclidean distance between the true and the predicted values
    """
    Xy_true = np.hstack((X_true.reshape(-1, 1), y_true.reshape(-1, 1)))
    Xy_fit = np.hstack((X_fit.reshape(-1, 1), y_fit.reshape(-1, 1)))

    # Distance between point i of the true curve and point i of the fitted curve
    dists = []
    for i in range(len(Xy_true)):
        dists.append(distance.euclidean(Xy_true[i], Xy_fit[i]))
    return np.mean(dists)
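# Example usage of direct_mean_euclidean_distance (illustrative; unlike the
# nearest-neighbour version above, points are paired by index, so both point
# sets must have the same length):
#
#   >>> X = np.array([0.0, 1.0, 2.0])
#   >>> y_t = np.array([0.0, 1.0, 4.0])
#   >>> direct_mean_euclidean_distance(X, y_t, X, y_t + 0.1)   # -> ~0.1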
def transform_data(y, y_pred, X=None, X_pred=None, transform_type='linear', epsilon=None):
    """Transform data according to the specified transformation type

    Parameters
    ----------
    y : array-like
        True values to transform
    y_pred : array-like
        Predicted values to transform alongside y
    X : array-like, optional
        X coordinates of the true values (currently unused), by default None
    X_pred : array-like, optional
        X coordinates of the predicted values (currently unused), by default None
    transform_type : str, optional
        Type of transformation to apply, by default 'linear'
        Possible values are:

            - 'linear': No transformation
            - 'log': Log10 transformation of absolute values
            - 'normalized': Division of each array by its own maximum value
            - 'normalized_log': Normalization followed by log transformation
            - 'sqrt': Square root transformation (negative values are clipped to 0)
    epsilon : float, optional
        Small value used to replace zeros before taking the log, by default the
        machine epsilon for float64

    Returns
    -------
    tuple of array-like
        (y_transformed, y_pred_transformed)

    Raises
    ------
    ValueError
        If the transformation type is not implemented
    """
    # Work on copies to avoid modifying the original data
    y_transformed = np.copy(y)
    y_pred_transformed = np.copy(y_pred)

    # Set epsilon to machine epsilon if not provided
    if epsilon is None:
        epsilon = np.finfo(np.float64).eps

    transform_type = transform_type.lower()
    if transform_type == 'linear':
        return y_transformed, y_pred_transformed

    elif transform_type == 'log':
        # Replace zeros with epsilon to avoid log(0)
        y_transformed = np.abs(y_transformed)
        y_transformed[y_transformed <= 0] = epsilon
        y_pred_transformed = np.abs(y_pred_transformed)
        y_pred_transformed[y_pred_transformed <= 0] = epsilon
        return np.log10(y_transformed), np.log10(y_pred_transformed)

    elif transform_type == 'normalized':
        # Each array is scaled by its own maximum
        y_transformed = y_transformed / np.max(y_transformed)
        y_pred_transformed = y_pred_transformed / np.max(y_pred_transformed)
        return y_transformed, y_pred_transformed

    elif transform_type == 'normalized_log':
        # Normalize each array by its own maximum, then log transform
        y_transformed = y_transformed / np.max(y_transformed)
        y_pred_transformed = y_pred_transformed / np.max(y_pred_transformed)
        y_transformed = np.abs(y_transformed)
        y_transformed[y_transformed <= 0] = epsilon
        y_pred_transformed = np.abs(y_pred_transformed)
        y_pred_transformed[y_pred_transformed <= 0] = epsilon
        return np.log10(y_transformed), np.log10(y_pred_transformed)

    elif transform_type == 'sqrt':
        # Clip negative values to zero so the square root is real-valued
        y_transformed[y_transformed < 0] = 0
        y_pred_transformed[y_pred_transformed < 0] = 0
        return np.sqrt(y_transformed), np.sqrt(y_pred_transformed)

    else:
        raise ValueError(f'The transformation type {transform_type} is not implemented.')
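# Example usage of transform_data (illustrative decade-spaced values): the 'log'
# transform returns log10 of the absolute values, with zeros replaced by epsilon.
#
#   >>> y = np.array([1e-3, 1e-2, 1e-1])
#   >>> y_pred = np.array([2e-3, 1e-2, 2e-1])
#   >>> yt, ypt = transform_data(y, y_pred, transform_type='log')
#   >>> yt   # -> array([-3., -2., -1.])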