Source code for pm4py.statistics.traces.generic.common.case_duration

'''
    PM4Py – A Process Mining Library for Python
Copyright (C) 2024 Process Intelligence Solutions UG (haftungsbeschränkt)

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU Affero General Public License for more details.

You should have received a copy of the GNU Affero General Public License
along with this program.  If not, see this software project's root or
visit <https://www.gnu.org/licenses/>.

Website: https://processintelligence.solutions
Contact: info@processintelligence.solutions
'''
import numpy as np
import json
import logging
import importlib.util
from pm4py.util import exec_utils
from enum import Enum



[docs]
class Parameters(Enum):
    """
    Keys that can be used in the ``parameters`` dictionary of this module's functions
    """
    # Number of points to include in the graph; get_kde_caseduration reads it
    # with a default of 200.
    GRAPH_POINTS = "graph_points"
    # NOTE(review): not read by any function visible in this module; presumably
    # the number of points to sample - TODO confirm against the callers.
    # The member name is singular while its value is plural ("points_to_sample").
    POINT_TO_SAMPLE = "points_to_sample"




[docs]
def get_kde_caseduration(duration_values, parameters=None):
    """
    Gets the estimation of KDE density for the case durations calculated on the log/dataframe

    The density is evaluated on a grid obtained by merging a linearly spaced
    and a geometrically spaced set of points between the smallest and the
    largest duration (each made of ``graph_points / 2`` points).

    Parameters
    --------------
    duration_values
        Values of duration
    parameters
        Possible parameters of the algorithm, including:
            graph_points -> number of points to include in the graph (default: 200)

    Returns
    --------------
    x
        X-axis values to represent
    y
        Y-axis values to represent

    The two lists are returned together as ``[x, y]``. Degenerate inputs do
    not go through the KDE: an empty input gives ``[[], []]``, while an input
    whose values are all identical (a single value included) gives
    ``[[value], [1.0]]``.

    Raises
    --------------
    Exception
        If scipy is not available
    """
    if importlib.util.find_spec("scipy") is None:
        msg = "scipy is not available. graphs cannot be built!"
        logging.error(msg)
        raise Exception(msg)

    from scipy.stats import gaussian_kde

    if parameters is None:
        parameters = {}

    duration_values = sorted(duration_values)

    # No data points: nothing to represent
    if not duration_values:
        return [[], []]

    # The list is sorted, so its bounds are the first and the last element
    lowest, highest = duration_values[0], duration_values[-1]

    # A single value, or several identical values, have zero variance:
    # gaussian_kde cannot be fitted on them (singular covariance matrix),
    # so a simple single-point representation is returned instead
    if lowest == highest:
        return [[lowest], [1.0]]

    graph_points = exec_utils.get_param_value(
        Parameters.GRAPH_POINTS, parameters, 200
    )
    half_points = int(graph_points / 2)

    density = gaussian_kde(duration_values)

    xs = list(np.linspace(lowest, highest, half_points))
    # A geometric progression needs strictly positive endpoints: the lower
    # bound is clamped to 0.001, and the log-spaced points are added only
    # when the upper bound is positive
    if highest > 0:
        xs += list(np.geomspace(max(lowest, 0.001), highest, half_points))
    xs.sort()

    return [xs, list(density(xs))]




[docs]
def get_kde_caseduration_json(duration_values, parameters=None):
    """
    Computes the KDE density estimation of the case durations and
    serializes the resulting graph points as JSON

    Parameters
    --------------
    duration_values
        Values of duration
    parameters
        Possible parameters of the algorithm (forwarded unchanged to
        get_kde_caseduration), including:
            graph_points: number of points to include in the graph

    Returns
    --------------
    json
        JSON string containing the list of (x, y) graph points
    """
    xs, ys = get_kde_caseduration(duration_values, parameters=parameters)

    # Pair every X-axis value with the corresponding density value
    points = list(zip(xs, ys))

    return json.dumps(points)