Source code for pm4py.algo.discovery.ocel.link_analysis.variants.classic

'''
    PM4Py – A Process Mining Library for Python
Copyright (C) 2024 Process Intelligence Solutions UG (haftungsbeschränkt)

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU Affero General Public License for more details.

You should have received a copy of the GNU Affero General Public License
along with this program.  If not, see this software project's root or
visit <https://www.gnu.org/licenses/>.

Website: https://processintelligence.solutions
Contact: info@processintelligence.solutions
'''
from enum import Enum

from pm4py.util import exec_utils, constants, xes_constants, pandas_utils
from typing import Optional, Dict, Any, Set
import pandas as pd


class Parameters(Enum):
    """
    Keys of the ``parameters`` dictionary understood by :func:`apply`.
    """

    # column holding the value "emitted" by the first event of a link
    OUT_COLUMN = "out_column"
    # column holding the value "received" by the second event of a link
    IN_COLUMN = "in_column"
    # column used to sort the dataframe before the index is inserted
    SORTING_COLUMN = "sorting_column"
    # name of the index column inserted into the sorted dataframe
    INDEX_COLUMN = "index_column"
    # keep only links whose target index is greater than the source index
    LOOK_FORWARD = "look_forward"
    # keep a single (the first) link for every source event
    KEEP_FIRST_OCCURRENCE = "keep_first_occurrence"
    # extend the links with the ones reachable through other links
    PROPAGATE = "propagate"
def propagate_associations(
    associations: Dict[str, Set[str]]
) -> Dict[str, Set[str]]:
    """
    Extends every association set with the targets that can be reached by
    following further associations, so that the eventually-follows flow
    between the events is obtained instead of the direct links only.

    The dictionary received as input is updated in place (each grown entry
    is rebound to a new set) and is also the returned object.

    Parameters
    -------------------
    associations
        Associations between events (source => set of targets)

    Returns
    ------------------
    propagated_associations
        Propagated associations
    """
    # target => sources that directly point to it (built once from the
    # initial associations; it is not extended while propagating)
    predecessors = {}
    for source, targets in associations.items():
        for target in targets:
            predecessors.setdefault(target, set()).add(source)

    # every source is examined in the first round; afterwards only the
    # sources whose set has grown need to push their targets backwards
    pending = list(associations)
    while pending:
        grown = set()
        for node in pending:
            for pred in predecessors.get(node, ()):
                merged = associations[pred] | associations[node]
                if len(merged) > len(associations[pred]):
                    grown.add(pred)
                associations[pred] = merged
        # keep the processing order aligned with the dictionary order
        pending = [node for node in associations if node in grown]

    return associations
def apply(
    dataframe: pd.DataFrame, parameters: Optional[Dict[Any, Any]] = None
) -> pd.DataFrame:
    """
    Performs a link analysis between the entries of the current dataframe.
    The link analysis permits advanced filtering based on events connected
    in an output-input relation (e.g., the OUT column of the first is equal
    to the IN column of the second).

    When OUT_COLUMN = IN_COLUMN = CASE ID, it can be equivalent to the
    directly-follows graph (when Parameters.KEEP_FIRST_OCCURRENCE = True),
    and to the eventually-follows graph
    (when Parameters.KEEP_FIRST_OCCURRENCE = False).

    Rows having a missing value in the OUT column (resp. IN column) are
    never used as first (resp. second) event of a relation.

    Parameters
    -----------------
    dataframe
        Pandas dataframe
    parameters
        Parameters of the algorithm, including:
        - Parameters.OUT_COLUMN => the output column of the dataframe
          (default: constants.CASE_CONCEPT_NAME)
        - Parameters.IN_COLUMN => the input column of the dataframe
          (default: constants.CASE_CONCEPT_NAME)
        - Parameters.SORTING_COLUMN => the column on top of which the
          dataframe is sorted before the index is inserted
          (default: xes_constants.DEFAULT_TIMESTAMP_KEY)
        - Parameters.INDEX_COLUMN => the attribute to use for the indexing
          (default: constants.DEFAULT_INDEX_KEY)
        - Parameters.LOOK_FORWARD => keeps only the relations in which the
          second event has an index strictly greater than the index of the
          first event (default: True)
        - Parameters.KEEP_FIRST_OCCURRENCE => keep, for every source event,
          only the first-occurring relationship with a target event
          (OUT=IN) (default: False)
        - Parameters.PROPAGATE => propagate the relationships between
          events, in such a way that the entire document flow chain can be
          reconstructed (default: False)

    Returns
    -----------------
    link_analysis_dataframe
        Link analysis dataframe: one row per relation, containing the
        columns of the first event (suffixed with "_out") and the columns
        of the second event (suffixed with "_in")
    """
    if parameters is None:
        parameters = {}

    out_column = exec_utils.get_param_value(
        Parameters.OUT_COLUMN, parameters, constants.CASE_CONCEPT_NAME
    )
    in_column = exec_utils.get_param_value(
        Parameters.IN_COLUMN, parameters, constants.CASE_CONCEPT_NAME
    )
    sorting_column = exec_utils.get_param_value(
        Parameters.SORTING_COLUMN,
        parameters,
        xes_constants.DEFAULT_TIMESTAMP_KEY,
    )
    index_column = exec_utils.get_param_value(
        Parameters.INDEX_COLUMN, parameters, constants.DEFAULT_INDEX_KEY
    )
    look_forward = exec_utils.get_param_value(
        Parameters.LOOK_FORWARD, parameters, True
    )
    keep_first_occurrence = exec_utils.get_param_value(
        Parameters.KEEP_FIRST_OCCURRENCE, parameters, False
    )
    propagate = exec_utils.get_param_value(
        Parameters.PROPAGATE, parameters, False
    )

    # names of the index column on the two sides of a relation
    out_index = index_column + "_out"
    in_index = index_column + "_in"

    # sort_values returns a new object, so the caller's dataframe is not
    # altered by the column renaming performed further below
    dataframe = dataframe.sort_values(sorting_column)
    # NOTE(review): insert_index presumably adds a positional index column
    # following the sorted order - confirm against pandas_utils
    dataframe = pandas_utils.insert_index(dataframe, index_column)

    # pandas matches null join keys against each other; without dropping
    # them, every event with a missing OUT value would be linked to every
    # event with a missing IN value
    df_red1 = dataframe[[out_column, index_column]].dropna(
        subset=[out_column]
    )
    df_red2 = dataframe[[in_column, index_column]].dropna(
        subset=[in_column]
    )
    df_red = df_red1.merge(
        df_red2,
        left_on=out_column,
        right_on=in_column,
        suffixes=("_out", "_in"),
    )
    # from here on only the pairs of indexes are needed
    df_red = df_red[[out_index, in_index]]

    if look_forward:
        df_red = df_red[df_red[out_index] < df_red[in_index]]

    if keep_first_occurrence:
        df_red = df_red.groupby(out_index).first().reset_index()

    # source index => set of target indexes
    associations = {}
    for source, target in zip(
        df_red[out_index].tolist(), df_red[in_index].tolist()
    ):
        associations.setdefault(source, set()).add(target)

    if propagate:
        associations = propagate_associations(associations)

    # flatten the associations back into a two-column relation
    out_clmn = []
    in_clmn = []
    for source, targets in associations.items():
        for target in targets:
            out_clmn.append(source)
            in_clmn.append(target)

    rel = pandas_utils.instantiate_dataframe(
        {out_index: out_clmn, in_index: in_clmn}
    )

    # attach the attributes of the first event of each relation ...
    df_link = dataframe.copy()
    df_link.columns = [x + "_out" for x in df_link.columns]
    df_link = df_link.merge(rel, on=out_index)

    # ... and then the attributes of the second event
    dataframe.columns = [x + "_in" for x in dataframe.columns]
    df_link = df_link.merge(dataframe, on=in_index)

    return df_link