# Source code for pm4py.objects.ocel.util.ocel_consistency
'''
PM4Py – A Process Mining Library for Python
Copyright (C) 2024 Process Intelligence Solutions UG (haftungsbeschränkt)
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see this software project's root or
visit <https://www.gnu.org/licenses/>.
Website: https://processintelligence.solutions
Contact: info@processintelligence.solutions
'''
from pm4py.objects.ocel.obj import OCEL
from typing import Optional, Dict, Any
import warnings
from pm4py.util import pandas_utils
# [docs]
def apply(ocel: OCEL, parameters: Optional[Dict[Any, Any]] = None) -> OCEL:
    """
    Forces the consistency of the OCEL, ensuring that the event/object identifier,
    event/object type are of type string and non-empty.

    For every table of the OCEL that is present and non-empty, the rows holding
    a missing value or an empty string in one of the identifier/type columns
    are dropped, and those columns are converted to strings. Afterwards a
    warning is issued if the event or the object identifiers are not unique.

    The cleaned tables are set on the provided OCEL object. The dataframes
    that were attached to it beforehand are left untouched (new dataframes
    are built instead of assigning columns in place).

    Parameters
    --------------
    ocel
        OCEL
    parameters
        Possible parameters of the method (none is read by this implementation)

    Returns
    --------------
    ocel
        Consistent OCEL (the same object that was provided as input)
    """
    event_id_col = ocel.event_id_column
    object_id_col = ocel.object_id_column
    event_activity = ocel.event_activity
    object_type_col = ocel.object_type_column

    # Identifier/type columns that must be non-null, non-empty strings,
    # grouped by the OCEL attribute holding the corresponding dataframe.
    fields = {
        "events": [event_id_col, event_activity],
        "objects": [object_id_col, object_type_col],
        "relations": [event_id_col, object_id_col, event_activity, object_type_col],
        "o2o": [object_id_col, object_id_col + "_2"],
        "e2e": [event_id_col, event_id_col + "_2"],
        "object_changes": [object_id_col],
    }

    for tab, columns in fields.items():
        # A table that is missing, unset or empty has nothing to clean.
        df = getattr(ocel, tab, None)
        if df is None or df.empty:
            continue

        # Only the columns actually present in this dataframe can be checked.
        valid_columns = [col for col in columns if col in df.columns]
        if not valid_columns:
            continue

        setattr(ocel, tab, _sanitize_identifier_columns(df, valid_columns))

    # Uniqueness is only reported, never enforced: duplicates are kept.
    uniqueness_checks = (
        (ocel.events, event_id_col,
         "The event identifiers in the OCEL are not unique!"),
        (ocel.objects, object_id_col,
         "The object identifiers in the OCEL are not unique!"),
    )
    for df, id_col, message in uniqueness_checks:
        if len(df) > 0 and df[id_col].nunique() < len(df):
            # stacklevel=2 attributes the warning to the caller of apply().
            warnings.warn(message, stacklevel=2)

    return ocel


def _sanitize_identifier_columns(df, columns):
    """
    Returns a new dataframe in which the given columns are non-null,
    non-empty strings; rows violating this are dropped.

    Parameters
    --------------
    df
        Dataframe to clean (not modified)
    columns
        Non-empty list of column names, all of them present in the dataframe

    Returns
    --------------
    df
        Cleaned dataframe
    """
    # Missing values must be removed BEFORE the string conversion, otherwise
    # they would be turned into regular strings and survive the filtering.
    na_rows = df[columns].isna().any(axis=1)
    if na_rows.any():
        df = df.loc[~na_rows]

    # assign() builds a new dataframe, so neither the caller's dataframe nor
    # the filtered selection above is written to in place (which would raise
    # a SettingWithCopyWarning on pandas versions without copy-on-write).
    df = df.assign(**{col: df[col].astype(str) for col in columns})

    # Single row mask: every checked column must hold a non-empty string.
    non_empty = None
    for col in columns:
        col_non_empty = df[col].str.len() > 0
        non_empty = col_non_empty if non_empty is None else non_empty & col_non_empty

    # Skip the selection entirely when there is nothing to drop.
    if not non_empty.all():
        df = df.loc[non_empty]

    return df