Source code for pm4py.statistics.attributes.common.get
import json
import logging
import importlib.util
from pm4py.util.points_subset import pick_chosen_points_list
from pm4py.util import exec_utils, pandas_utils, constants
from enum import Enum
[docs]
class Parameters(Enum):
GRAPH_POINTS = "graph_points"
POINT_TO_SAMPLE = "points_to_sample"
[docs]
def get_sorted_attributes_list(attributes):
"""
Gets sorted attributes list
Parameters
----------
attributes
Dictionary of attributes associated with their count
Returns
----------
listact
Sorted end attributes list
"""
listattr = []
for a in attributes:
listattr.append([a, attributes[a]])
listattr = sorted(listattr, key=lambda x: x[1], reverse=True)
return listattr
[docs]
def get_attributes_threshold(
alist, decreasing_factor, min_activity_count=1, max_activity_count=25
):
"""
Get attributes cutting threshold
Parameters
----------
alist
Sorted attributes list
decreasing_factor
Decreasing factor of the algorithm
min_activity_count
Minimum number of activities to include
max_activity_count
Maximum number of activities to include
Returns
---------
threshold
Activities cutting threshold
"""
index = max(0, min(min_activity_count - 1, len(alist) - 1))
threshold = alist[index][1]
index = index + 1
for i in range(index, len(alist)):
value = alist[i][1]
if value > threshold * decreasing_factor:
threshold = value
if i >= max_activity_count:
break
return threshold
[docs]
def get_kde_numeric_attribute(values, parameters=None):
"""
Gets the KDE estimation for the distribution of a numeric attribute values
Parameters
-------------
values
Values of the numeric attribute value
parameters
Possible parameters of the algorithm, including:
graph_points -> number of points to include in the graph
Returns
--------------
x
X-axis values to represent
y
Y-axis values to represent
"""
if importlib.util.find_spec("scipy") and importlib.util.find_spec("numpy"):
from scipy.stats import gaussian_kde
import numpy as np
import pandas as pd
if parameters is None:
parameters = {}
graph_points = exec_utils.get_param_value(
Parameters.GRAPH_POINTS, parameters, 200
)
values = sorted(values)
density = gaussian_kde(values)
xs1 = list(
np.linspace(min(values), max(values), int(graph_points / 2))
)
xs2 = list(
np.geomspace(
max(min(values), 0.000001), max(values), int(graph_points / 2)
)
)
xs = sorted(xs1 + xs2)
return [xs, list(density(xs))]
else:
msg = "scipy is not available. graphs cannot be built!"
logging.error(msg)
raise Exception(msg)
[docs]
def get_kde_numeric_attribute_json(values, parameters=None):
"""
Gets the KDE estimation for the distribution of a numeric attribute values
(expressed as JSON)
Parameters
--------------
values
Values of the numeric attribute value
parameters
Possible parameters of the algorithm, including:
graph_points: number of points to include in the graph
Returns
--------------
json
JSON representing the graph points
"""
x, y = get_kde_numeric_attribute(values, parameters=parameters)
ret = []
for i in range(len(x)):
ret.append((x[i], y[i]))
return json.dumps(ret)
[docs]
def get_kde_date_attribute(values, parameters=None):
"""
Gets the KDE estimation for the distribution of a date attribute values
Parameters
-------------
values
Values of the date attribute value
parameters
Possible parameters of the algorithm, including:
graph_points -> number of points to include in the graph
Returns
--------------
x
X-axis values to represent
y
Y-axis values to represent
"""
if importlib.util.find_spec("scipy") and importlib.util.find_spec("numpy"):
from scipy.stats import gaussian_kde
import numpy as np
import pandas as pd
if parameters is None:
parameters = {}
graph_points = exec_utils.get_param_value(
Parameters.GRAPH_POINTS, parameters, 200
)
points_to_sample = exec_utils.get_param_value(
Parameters.POINT_TO_SAMPLE, parameters, 400
)
red_values = pick_chosen_points_list(points_to_sample, values)
int_values = sorted(
[x.replace(tzinfo=None).timestamp() for x in red_values]
)
density = gaussian_kde(int_values)
xs = np.linspace(min(int_values), max(int_values), graph_points)
xs_transf = pd.to_datetime(xs * 10**9, unit="ns")
return [xs_transf, density(xs)]
else:
msg = "scipy is not available. graphs cannot be built!"
logging.error(msg)
raise Exception(msg)
[docs]
def get_kde_date_attribute_json(values, parameters=None):
"""
Gets the KDE estimation for the distribution of a date attribute values
(expressed as JSON)
Parameters
--------------
values
Values of the date attribute value
parameters
Possible parameters of the algorithm, including:
graph_points: number of points to include in the graph
Returns
--------------
json
JSON representing the graph points
"""
x, y = get_kde_date_attribute(values, parameters=parameters)
ret = []
for i in range(len(x)):
ret.append((x[i].replace(tzinfo=None).timestamp(), y[i]))
return json.dumps(ret)