# Source code for pm4py.algo.clustering.trace_attribute_driven.algorithm
'''
PM4Py – A Process Mining Library for Python
Copyright (C) 2024 Process Intelligence Solutions UG (haftungsbeschränkt)
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see this software project's root or
visit <https://www.gnu.org/licenses/>.
Website: https://processintelligence.solutions
Contact: info@processintelligence.solutions
'''
from scipy.cluster.hierarchy import to_tree, linkage
from pm4py.statistics.attributes.log import get as attributes_filter
from pm4py.algo.clustering.trace_attribute_driven.merge_log import merge_log
from pm4py.algo.clustering.trace_attribute_driven.util import evaluation
from pm4py.objects.conversion.log import converter as log_converter
from enum import Enum
from pm4py.util import exec_utils
from typing import Optional, Dict, Any, Union
from pm4py.objects.log.obj import EventLog, EventStream
import pandas as pd
[docs]
class Variants(Enum):
    """
    Distance-evaluation routines that can be selected as clustering variant.

    Each name is bound to an attribute of the ``evaluation`` utility module.
    ``apply`` resolves the chosen variant through ``exec_utils.get_variant``
    and calls the result with the list of sub-logs, ``percent`` and ``alpha``.
    """

    # NOTE(review): if these attributes are plain functions, Enum treats them
    # as methods rather than enum members, so ``Variants.X`` would be the
    # function itself and not an object with ``.name`` / ``.value`` — confirm
    # before relying on member semantics (iteration, ``Variants(value)``).
    VARIANT_DMM_LEVEN = evaluation.eval_DMM_leven
    VARIANT_AVG_LEVEN = evaluation.eval_avg_leven
    VARIANT_DMM_VEC = evaluation.eval_DMM_variant
    VARIANT_AVG_VEC = evaluation.eval_avg_variant
    DFG = evaluation.dfg_dist
# Module-level aliases, so a variant can be referenced without spelling out
# the Variants class (apply() uses VARIANT_DMM_LEVEN as its default).
VARIANT_DMM_LEVEN = Variants.VARIANT_DMM_LEVEN
VARIANT_AVG_LEVEN = Variants.VARIANT_AVG_LEVEN
VARIANT_DMM_VEC = Variants.VARIANT_DMM_VEC
VARIANT_AVG_VEC = Variants.VARIANT_AVG_VEC
DFG = Variants.DFG

# Set of all selectable variants: every alias defined above appears exactly
# once (VARIANT_AVG_LEVEN included).
VERSIONS = {
    VARIANT_DMM_LEVEN,
    VARIANT_AVG_LEVEN,
    VARIANT_DMM_VEC,
    VARIANT_AVG_VEC,
    DFG,
}
[docs]
def bfs(tree):
    """
    Walk a nested tree breadth-first and list every parent with its children.

    Parameters
    ----------------
    tree
        Root node: a dictionary with a "name" entry and a "children" entry,
        where "children" is a sequence of nodes of the same shape

    Returns
    -----------------
    output
        One list per node that has at least one child, in breadth-first
        order; each list is [node name, first child name, second child name, ...].
        Nodes without children produce no entry.
    """
    # Local import: a deque pops from the left in O(1), whereas list.pop(0)
    # shifts every remaining element and makes the traversal quadratic.
    from collections import deque

    queue = deque([tree])
    output = []
    while queue:
        # The oldest queued node becomes the current root and is split into
        # its children, which are queued behind everything already waiting.
        root = queue.popleft()
        children = root["children"]
        if len(children) > 0:
            name = [root["name"]]
            for child in children:
                queue.append(child)
                name.append(child["name"])
            output.append(name)
    return output
[docs]
def apply(
    log: Union[EventLog, EventStream, pd.DataFrame],
    trace_attribute: str,
    variant=VARIANT_DMM_LEVEN,
    parameters: Optional[Dict[Any, Any]] = None,
) -> Any:
    """
    Apply the hierarchical clustering to a log starting from a trace attribute.

    The log is split into one sub-log per value of the trace attribute, the
    selected variant computes the distances between those sub-logs, and the
    distances are clustered with average linkage.

    MSc Thesis is available at: https://www.pads.rwth-aachen.de/global/show_document.asp?id=aaaaaaaaalpxgft&download=1
    Defense slides are available at: https://www.pads.rwth-aachen.de/global/show_document.asp?id=aaaaaaaaalpxgqx&download=1

    Parameters
    ----------------
    log
        Log (event log, event stream or dataframe; converted to an event log)
    trace_attribute
        Trace attribute to exploit for the clustering
    variant
        Variant of the algorithm to apply, possible values:
        - Variants.VARIANT_DMM_LEVEN (that is the default)
        - Variants.VARIANT_AVG_LEVEN
        - Variants.VARIANT_DMM_VEC
        - Variants.VARIANT_AVG_VEC
        - Variants.DFG
    parameters
        Optional parameters, forwarded to the log conversion

    Returns
    -----------------
    tree
        Hierarchical cluster tree, as a nested dictionary whose top node is
        named "root"
    leafname
        Value returned by merge_log.label_tree for the top node
        (presumably the labels of the leaves below it — TODO confirm)
    """
    if parameters is None:
        parameters = {}

    log = log_converter.apply(
        log, variant=log_converter.Variants.TO_EVENT_LOG, parameters=parameters
    )

    # Fixed arguments handed to the evaluation routine.
    # NOTE(review): their exact meaning is defined by the evaluation module.
    percent = 1
    alpha = 0.5

    # One sub-log per distinct value of the trace attribute; the position of
    # a value in list_of_vals is the observation index used by the clustering.
    list_of_vals = list(
        attributes_filter.get_trace_attribute_values(
            log, trace_attribute
        ).keys()
    )
    list_log = [
        merge_log.log2sublog(log, val, trace_attribute)
        for val in list_of_vals
    ]

    y = exec_utils.get_variant(variant)(list_log, percent, alpha)
    Z = linkage(y, method="average")

    # Dictionary for labeling nodes by their IDs
    id2name = dict(enumerate(list_of_vals))

    T = to_tree(Z, rd=False)

    # add_node attaches the converted tree below a throw-away holder node;
    # the holder's only child is the actual top of the cluster tree.
    d3Dendro = dict(children=[], name="Root1")
    merge_log.add_node(T, d3Dendro)
    tree = d3Dendro["children"][0]
    leafname = merge_log.label_tree(tree, id2name)
    tree["name"] = "root"

    # No further post-processing: a breadth-first listing of the tree was
    # previously built here, but it was never returned and did not modify
    # the tree, so it has been dropped.
    return tree, leafname