# Source code for pm4py.algo.querying.llm.abstractions.log_to_dfg_descr
'''
PM4Py – A Process Mining Library for Python
Copyright (C) 2024 Process Intelligence Solutions UG (haftungsbeschränkt)
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see this software project's root or
visit <https://www.gnu.org/licenses/>.
Website: https://processintelligence.solutions
Contact: info@processintelligence.solutions
'''
from pm4py.objects.conversion.log import converter as log_converter
from pm4py.algo.discovery.dfg.adapters.pandas import df_statistics
from typing import Union, Optional, Dict, Any, Tuple
from pm4py.objects.log.obj import EventLog, EventStream
from enum import Enum
from pm4py.util import exec_utils, constants, xes_constants
import pandas as pd
import math
import numpy as np
[docs]
class Parameters(Enum):
    """
    Keys of the optional ``parameters`` dictionary read by the functions of this module.
    """

    # Options controlling the content and size of the textual abstraction
    INCLUDE_FREQUENCY = "include_frequency"
    INCLUDE_PERFORMANCE = "include_performance"
    MAX_LEN = "max_len"
    RELATIVE_FREQUENCY = "relative_frequency"
    RESPONSE_HEADER = "response_header"

    # Names of the performance metrics reported for every path
    PRIMARY_PERFORMANCE_AGGREGATION = "primary_performance_aggregation"
    SECONDARY_PERFORMANCE_AGGREGATION = "secondary_performance_aggregation"

    # Attribute names used to read the log (values come from the shared pm4py constants)
    ACTIVITY_KEY = constants.PARAMETER_CONSTANT_ACTIVITY_KEY
    TIMESTAMP_KEY = constants.PARAMETER_CONSTANT_TIMESTAMP_KEY
    CASE_ID_KEY = constants.PARAMETER_CONSTANT_CASEID_KEY
[docs]
def abstraction_from_frequency_performance_dfg(
    freq_dfg: Dict[Tuple[str, str], int],
    perf_dfg: Dict[Tuple[str, str], Dict[str, float]],
    parameters: Optional[Dict[Any, Any]] = None,
) -> str:
    """
    Obtains the abstraction starting from the knowledge of the frequency of the paths, and their performance.

    Every path of the frequency DFG is rendered on its own line as
    "A -> B ( frequency = ...  performance = ... )". Paths are listed by decreasing frequency
    (ties are broken by the reverse ordering of the path tuples).

    Minimal viable example:

        import pm4py
        from pm4py.algo.querying.llm.abstractions import log_to_dfg_descr

        log = pm4py.read_xes('tests/input_data/running-example.xes')
        freq_dfg, sa, ea = pm4py.discover_dfg(log)
        perf_dfg, sa, ea = pm4py.discover_performance_dfg(log)
        print(log_to_dfg_descr.abstraction_from_frequency_performance_dfg(freq_dfg, perf_dfg))

    Parameters
    ---------------
    freq_dfg
        Dictionary associating to each path its frequency
    perf_dfg
        Dictionary associating to each path its performance. A path ('A', 'B') is associated to a dictionary
        containing performance metrics, i.e. ('A', 'B'): {'mean': 86400, 'stdev': 86400} means that:
        - the average time between the activities A and B is 1 day
        - also the standard deviation of the times between A and B is 1 day
        A metric that is missing, None or NaN for a path is left out of that path's line.
    parameters
        Optional parameters of the algorithm, including:
        - Parameters.RELATIVE_FREQUENCY => (boolean) if True, the frequencies are marked as percentages
          in the textual abstraction (the values of freq_dfg are printed as they are)
        - Parameters.INCLUDE_FREQUENCY => includes the frequency of the arcs in the textual abstraction
        - Parameters.INCLUDE_PERFORMANCE => includes the performance of the arcs in the textual abstraction
        - Parameters.MAX_LEN => desired length of the textual abstraction; no further path is added once
          the text built so far is longer than this value (default: constants.OPENAI_MAX_LEN)
        - Parameters.RESPONSE_HEADER => includes a header in the textual abstraction, which explains the context
        - Parameters.PRIMARY_PERFORMANCE_AGGREGATION => primary performance metric to be used to express the performance of the arcs (e.g., mean). Available options: mean, median, stdev, min, max, sum
        - Parameters.SECONDARY_PERFORMANCE_AGGREGATION => secondary performance metric to be used to express the performance of the arcs (e.g., stdev). Available options: mean, median, stdev, min, max, sum

    Returns
    ---------------
    textual_abstraction
        Textual abstraction
    """
    if parameters is None:
        parameters = {}

    relative_frequency = exec_utils.get_param_value(
        Parameters.RELATIVE_FREQUENCY, parameters, False
    )
    include_frequency = exec_utils.get_param_value(
        Parameters.INCLUDE_FREQUENCY, parameters, True
    )
    include_performance = exec_utils.get_param_value(
        Parameters.INCLUDE_PERFORMANCE, parameters, True
    )
    max_len = exec_utils.get_param_value(
        Parameters.MAX_LEN, parameters, constants.OPENAI_MAX_LEN
    )
    response_header = exec_utils.get_param_value(
        Parameters.RESPONSE_HEADER, parameters, True
    )
    primary_performance_aggregation = exec_utils.get_param_value(
        Parameters.PRIMARY_PERFORMANCE_AGGREGATION, parameters, "mean"
    )
    secondary_performance_aggregation = exec_utils.get_param_value(
        Parameters.SECONDARY_PERFORMANCE_AGGREGATION, parameters, None
    )

    # Most frequent paths first; equal frequencies are ordered by the path tuple (descending).
    paths = sorted(
        freq_dfg, key=lambda path: (freq_dfg[path], path), reverse=True
    )

    # The parenthesis is opened and closed under the very same condition, so that
    # a line never ends up with an unbalanced ")" or without its line terminator.
    has_details = include_frequency or include_performance

    header = "If I have a process with flow:\n\n" if response_header else "\n\n"
    chunks = [header]
    # Running length of the text, so the budget check does not need the joined string.
    current_len = len(header)

    for path in paths:
        # The budget is checked before adding a line: the final text can therefore
        # exceed max_len by at most one path description (plus the trailer).
        if current_len > max_len:
            break

        line = "%s -> %s " % (path[0], path[1])
        if has_details:
            line += "("

        if include_frequency:
            line += " frequency = " + str(freq_dfg[path])
            if relative_frequency:
                line += "\\%"
            line += " "

        if include_performance:
            primary_value = _available_metric(
                perf_dfg, path, primary_performance_aggregation
            )
            if primary_value is not None:
                line += " performance = %.3f " % primary_value

            secondary_value = _available_metric(
                perf_dfg, path, secondary_performance_aggregation
            )
            if secondary_value is not None:
                # The metric name is passed as an argument (not embedded in the
                # format string), so a "%" inside it cannot break the formatting.
                line += " %s = %.3f " % (
                    secondary_performance_aggregation,
                    secondary_value,
                )

        if has_details:
            line += ")"
        line += "\n"

        chunks.append(line)
        current_len += len(line)

    chunks.append("\n\n")
    return "".join(chunks)


def _available_metric(
    perf_dfg: Dict[Tuple[str, str], Dict[str, float]],
    path: Tuple[str, str],
    aggregation: Optional[str],
) -> Optional[float]:
    """
    Returns the requested performance metric of a path, or None when it cannot be reported
    (no aggregation requested, path or metric absent from perf_dfg, value None or NaN).
    """
    if aggregation is None:
        return None
    value = perf_dfg.get(path, {}).get(aggregation)
    if value is None or np.isnan(value):
        return None
    return value
[docs]
def apply(
    log_obj: Union[EventLog, EventStream, pd.DataFrame],
    parameters: Optional[Dict[Any, Any]] = None,
) -> str:
    """
    Gets the textual abstraction of the directly-follows graph computed on the provided log object.

    The log object is converted to a Pandas dataframe, the frequency and performance DFGs are
    computed on it, and the result is rendered by abstraction_from_frequency_performance_dfg.

    Minimal viable example:

        import pm4py
        from pm4py.algo.querying.llm.abstractions import log_to_dfg_descr

        log = pm4py.read_xes('tests/input_data/running-example.xes')
        print(log_to_dfg_descr.apply(log))

    Parameters
    ---------------
    log_obj
        Log object (event log / Pandas dataframe)
    parameters
        Optional Parameters of the algorithm, including:
        - Parameters.ACTIVITY_KEY => the attribute to be used as activity
        - Parameters.TIMESTAMP_KEY => the attribute to be used as timestamp
        - Parameters.CASE_ID_KEY => the attribute to be used as case ID
        - Parameters.RELATIVE_FREQUENCY => (boolean) decides if the frequency DFG should be normalized to a relative
            frequency: the DFG is recomputed with keep_once_per_case=True and every value is replaced by
            floor(100 * value / number of distinct case IDs), with a minimum of 1
        - Parameters.INCLUDE_FREQUENCY => includes the frequency of the arcs in the textual abstraction
        - Parameters.INCLUDE_PERFORMANCE => includes the performance of the arcs in the textual abstraction
        - Parameters.MAX_LEN => desired length of the textual abstraction
        - Parameters.RESPONSE_HEADER => includes a header in the textual abstraction, which explains the context
        - Parameters.PRIMARY_PERFORMANCE_AGGREGATION => primary performance metric to be used to express the performance of the arcs (e.g., mean). Available options: mean, median, stdev, min, max, sum
        - Parameters.SECONDARY_PERFORMANCE_AGGREGATION => secondary performance metric to be used to express the performance of the arcs (e.g., stdev). Available options: mean, median, stdev, min, max, sum

    Returns
    ---------------
    textual_abstraction
        Textual abstraction
    """
    parameters = {} if parameters is None else parameters

    read_param = exec_utils.get_param_value
    activity_key = read_param(
        Parameters.ACTIVITY_KEY, parameters, xes_constants.DEFAULT_NAME_KEY
    )
    timestamp_key = read_param(
        Parameters.TIMESTAMP_KEY,
        parameters,
        xes_constants.DEFAULT_TIMESTAMP_KEY,
    )
    case_id_key = read_param(
        Parameters.CASE_ID_KEY, parameters, constants.CASE_CONCEPT_NAME
    )
    relative_frequency = read_param(
        Parameters.RELATIVE_FREQUENCY, parameters, False
    )

    # All the computations below are carried out on the dataframe form of the log.
    dataframe = log_converter.apply(
        log_obj,
        variant=log_converter.Variants.TO_DATA_FRAME,
        parameters=parameters,
    )

    # Column names shared by both DFG computations.
    column_kwargs = {
        "activity_key": activity_key,
        "case_id_glue": case_id_key,
        "timestamp_key": timestamp_key,
    }

    # A single pass yields the frequency DFG and the performance DFG with all the aggregations.
    freq_dfg, perf_dfg = df_statistics.get_dfg_graph(
        dataframe,
        measure="both",
        perf_aggregation_key="all",
        **column_kwargs,
    )

    if relative_frequency:
        # The absolute counts are replaced by percentages over the number of cases.
        per_case_dfg = df_statistics.get_dfg_graph(
            dataframe,
            measure="frequency",
            keep_once_per_case=True,
            **column_kwargs,
        )
        num_cases = dataframe[case_id_key].nunique()
        freq_dfg = {}
        for path, occurrences in per_case_dfg.items():
            # Rounded down, but never below 1 so that rare paths do not show up as 0.
            percentage = math.floor((occurrences * 100.0) / num_cases)
            freq_dfg[path] = max(1, percentage)

    return abstraction_from_frequency_performance_dfg(
        freq_dfg, perf_dfg, parameters=parameters
    )