Source code for pm4py.algo.querying.llm.abstractions.log_to_dfg_descr

'''
    PM4Py – A Process Mining Library for Python
Copyright (C) 2024 Process Intelligence Solutions UG (haftungsbeschränkt)

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU Affero General Public License for more details.

You should have received a copy of the GNU Affero General Public License
along with this program.  If not, see this software project's root or
visit <https://www.gnu.org/licenses/>.

Website: https://processintelligence.solutions
Contact: info@processintelligence.solutions
'''

from pm4py.objects.conversion.log import converter as log_converter
from pm4py.algo.discovery.dfg.adapters.pandas import df_statistics
from typing import Union, Optional, Dict, Any, Tuple
from pm4py.objects.log.obj import EventLog, EventStream
from enum import Enum
from pm4py.util import exec_utils, constants, xes_constants
import pandas as pd
import math
import numpy as np



[docs]
class Parameters(Enum):
    INCLUDE_FREQUENCY = "include_frequency"
    INCLUDE_PERFORMANCE = "include_performance"
    MAX_LEN = "max_len"
    RELATIVE_FREQUENCY = "relative_frequency"
    RESPONSE_HEADER = "response_header"
    PRIMARY_PERFORMANCE_AGGREGATION = "primary_performance_aggregation"
    SECONDARY_PERFORMANCE_AGGREGATION = "secondary_performance_aggregation"
    ACTIVITY_KEY = constants.PARAMETER_CONSTANT_ACTIVITY_KEY
    TIMESTAMP_KEY = constants.PARAMETER_CONSTANT_TIMESTAMP_KEY
    CASE_ID_KEY = constants.PARAMETER_CONSTANT_CASEID_KEY




[docs]
def abstraction_from_frequency_performance_dfg(
    freq_dfg: Dict[Tuple[str, str], int],
    perf_dfg: Dict[Tuple[str, str], Dict[str, float]],
    parameters: Optional[Dict[Any, Any]] = None,
) -> str:
    """
    Obtains the abstraction starting from the knowledge of the frequency of the paths, and their performance.

    Minimal viable example:

        import pm4py
        from pm4py.algo.querying.llm.abstractions import log_to_dfg_descr

        log = pm4py.read_xes('tests/input_data/running-example.xes')
        freq_dfg, sa, ea = pm4py.discover_dfg(log)
        perf_dfg, sa, ea = pm4py.discover_performance_dfg(log)
        print(log_to_dfg_descr.abstraction_from_frequency_performance_dfg(freq_dfg, perf_dfg))

    Parameters
    ---------------
    freq_dfg
        Dictionary associating to each path its frequency
    perf_dfg
        Dictionary associating to each path its performance. A path ('A', 'B') is associated to a dictionary
        containing performance metrics, i.e. ('A', 'B'): {'mean': 86400, 'stdev': 86400} means that:
        - the average time between the activities A and B is 1 day
        - also the standard deviation of the times between A and B is 1 day
    parameters
        Optional parameters of the algorithm, including:
                - Parameters.RELATIVE_FREQUENCY => (boolean) decides if the frequency DFG should be normalized to a relative
                                            frequency
                - Parameters.INCLUDE_FREQUENCY => includes the frequency of the arcs in the textual abstraction
                - Parameters.INCLUDE_PERFORMANCE => includes the performance of the arcs in the textual abstraction
                - Parameters.MAX_LEN => desidered length of the textual abstraction
                - Parameters.RESPONSE_HEADER => includes an header in the textual abstraction, which explains the context
                - Parameters.PRIMARY_PERFORMANCE_AGGREGATION => primary performance metric to be used to express the performance of the arcs (e.g., mean). Available options: mean, median, stdev, min, max, sum
                - Parameters.SECONDARY_PERFORMANCE_AGGREGATION => secondary performance metric to be used to express the performance of the arcs (e.g., stdev). Available options: mean, median, stdev, min, max, sum
    Returns
    ---------------
    textual_abstraction
        Textual abstraction
    """
    if parameters is None:
        parameters = {}

    relative_frequency = exec_utils.get_param_value(
        Parameters.RELATIVE_FREQUENCY, parameters, False
    )
    include_frequency = exec_utils.get_param_value(
        Parameters.INCLUDE_FREQUENCY, parameters, True
    )
    include_performance = exec_utils.get_param_value(
        Parameters.INCLUDE_PERFORMANCE, parameters, True
    )
    max_len = exec_utils.get_param_value(
        Parameters.MAX_LEN, parameters, constants.OPENAI_MAX_LEN
    )
    response_header = exec_utils.get_param_value(
        Parameters.RESPONSE_HEADER, parameters, True
    )
    primary_performance_aggregation = exec_utils.get_param_value(
        Parameters.PRIMARY_PERFORMANCE_AGGREGATION, parameters, "mean"
    )
    secondary_performance_aggregation = exec_utils.get_param_value(
        Parameters.SECONDARY_PERFORMANCE_AGGREGATION, parameters, None
    )

    paths = sorted(
        [(x, y) for x, y in freq_dfg.items()],
        key=lambda z: (z[1], z[0]),
        reverse=True,
    )
    paths = [x[0] for x in paths]

    ret = "If I have a process with flow:\n\n" if response_header else "\n\n"
    for p in paths:
        if len(ret) > max_len:
            break
        stru = "%s -> %s " % (p[0], p[1])
        if include_frequency or include_performance:
            stru = stru + "("
            if include_frequency:
                stru = stru + " frequency = "
                stru = stru + str(freq_dfg[p])
                if relative_frequency:
                    stru = stru + "\\%"
                stru = stru + " "
            if include_performance:
                stru = stru + " performance = "
                stru = (
                    stru
                    + "%.3f" % perf_dfg[p][primary_performance_aggregation]
                )
                stru = stru + " "
                if (
                    secondary_performance_aggregation is not None
                    and secondary_performance_aggregation in perf_dfg[p]
                    and perf_dfg[p][secondary_performance_aggregation]
                    is not None
                    and not np.isnan(
                        perf_dfg[p][secondary_performance_aggregation]
                    )
                ):
                    stru = (
                        stru + " " + secondary_performance_aggregation + " = "
                    )
                    stru = (
                        stru
                        + "%.3f"
                        % perf_dfg[p][secondary_performance_aggregation]
                    )
                    stru = stru + " "
            stru = stru + ")\n"
        ret = ret + stru
    ret = ret + "\n\n"
    return ret




[docs]
def apply(
    log_obj: Union[EventLog, EventStream, pd.DataFrame],
    parameters: Optional[Dict[Any, Any]] = None,
) -> str:
    """
    Gets the textual abstraction of the directly-follows graph computed on the provided log object.

    Minimal viable example:
        import pm4py
        from pm4py.algo.querying.llm.abstractions import log_to_dfg_descr

        log = pm4py.read_xes('tests/input_data/running-example.xes')
        print(log_to_dfg_descr.apply(log))

    Parameters
    ---------------
    log_obj
        Log object (event log / Pandas dataframe)
    parameters
        Optional Parameters of the algorithm, including:
            - Parameters.ACTIVITY_KEY => the attribute to be used as activity
            - Parameters.TIMESTAMP_KEY => the attribute to be used as timestamp
            - Parameters.CASE_ID_KEY => the attribute to be used as case ID
            - Parameters.RELATIVE_FREQUENCY => (boolean) decides if the frequency DFG should be normalized to a relative
                                                frequency
            - Parameters.INCLUDE_FREQUENCY => includes the frequency of the arcs in the textual abstraction
            - Parameters.INCLUDE_PERFORMANCE => includes the performance of the arcs in the textual abstraction
            - Parameters.MAX_LEN => desidered length of the textual abstraction
            - Parameters.RESPONSE_HEADER => includes an header in the textual abstraction, which explains the context
            - Parameters.PRIMARY_PERFORMANCE_AGGREGATION => primary performance metric to be used to express the performance of the arcs (e.g., mean). Available options: mean, median, stdev, min, max, sum
            - Parameters.SECONDARY_PERFORMANCE_AGGREGATION => secondary performance metric to be used to express the performance of the arcs (e.g., stdev). Available options: mean, median, stdev, min, max, sum

    Returns
    ---------------
    textual_abstraction
        Textual abstraction
    """
    if parameters is None:
        parameters = {}

    activity_key = exec_utils.get_param_value(
        Parameters.ACTIVITY_KEY, parameters, xes_constants.DEFAULT_NAME_KEY
    )
    timestamp_key = exec_utils.get_param_value(
        Parameters.TIMESTAMP_KEY,
        parameters,
        xes_constants.DEFAULT_TIMESTAMP_KEY,
    )
    case_id_key = exec_utils.get_param_value(
        Parameters.CASE_ID_KEY, parameters, constants.CASE_CONCEPT_NAME
    )
    relative_frequency = exec_utils.get_param_value(
        Parameters.RELATIVE_FREQUENCY, parameters, False
    )

    log_obj = log_converter.apply(
        log_obj,
        variant=log_converter.Variants.TO_DATA_FRAME,
        parameters=parameters,
    )
    freq_dfg, perf_dfg = df_statistics.get_dfg_graph(
        log_obj,
        measure="both",
        perf_aggregation_key="all",
        activity_key=activity_key,
        case_id_glue=case_id_key,
        timestamp_key=timestamp_key,
    )
    if relative_frequency:
        freq_dfg = df_statistics.get_dfg_graph(
            log_obj,
            measure="frequency",
            activity_key=activity_key,
            case_id_glue=case_id_key,
            timestamp_key=timestamp_key,
            keep_once_per_case=True,
        )
        num_cases = log_obj[case_id_key].nunique()
        freq_dfg = {
            x: max(1, math.floor((y * 100.0) / num_cases))
            for x, y in freq_dfg.items()
        }

    return abstraction_from_frequency_performance_dfg(
        freq_dfg, perf_dfg, parameters=parameters
    )