# Source code for pm4py.objects.ocel.util.ocel_consistency

'''
    PM4Py – A Process Mining Library for Python
Copyright (C) 2024 Process Intelligence Solutions UG (haftungsbeschränkt)

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU Affero General Public License for more details.

You should have received a copy of the GNU Affero General Public License
along with this program.  If not, see this software project's root or
visit <https://www.gnu.org/licenses/>.

Website: https://processintelligence.solutions
Contact: info@processintelligence.solutions
'''

from pm4py.objects.ocel.obj import OCEL
from typing import Optional, Dict, Any
import warnings
from pm4py.util import pandas_utils



# [docs]
def apply(ocel: OCEL, parameters: Optional[Dict[Any, Any]] = None) -> OCEL:
    """
    Forces the consistency of the OCEL, ensuring that the event/object identifier,
    event/object type are of type string and non-empty.

    For every table of the OCEL (events, objects, relations, o2o, e2e,
    object_changes) that is set and non-empty, the rows having a missing value
    or an empty string in one of the checked columns are dropped, and the
    checked columns are converted to strings. The resulting tables are stored
    back on the provided OCEL, which is therefore modified in place.
    Afterwards, a warning is issued if the event identifiers or the object
    identifiers are not unique.

    Parameters
    --------------
    ocel
        OCEL
    parameters
        Possible parameters of the method (accepted for interface
        compatibility; no parameter is currently read)

    Returns
    --------------
    ocel
        Consistent OCEL (the same object that was provided as input)
    """
    # Resolve the column names once, since they are reused for several tables
    event_id_col = ocel.event_id_column
    object_id_col = ocel.object_id_column
    event_activity = ocel.event_activity
    object_type_col = ocel.object_type_column

    # Columns that must hold non-missing, non-empty strings, keyed by the
    # name of the OCEL attribute holding the corresponding table
    fields = {
        "events": [event_id_col, event_activity],
        "objects": [object_id_col, object_type_col],
        "relations": [event_id_col, object_id_col, event_activity, object_type_col],
        "o2o": [object_id_col, object_id_col + "_2"],
        "e2e": [event_id_col, event_id_col + "_2"],
        "object_changes": [object_id_col],
    }

    for tab, columns in fields.items():
        # Skip the tables that are missing (or unset) on this OCEL, and the
        # ones that contain no rows
        df = getattr(ocel, tab, None)
        if df is None or df.empty:
            continue

        # Restrict the check to the columns actually present in the table
        valid_columns = [col for col in columns if col in df.columns]
        if not valid_columns:
            continue

        # Drop the rows having a missing value in any of the checked columns.
        # The explicit copy detaches the filtered table from the original
        # one, so that the column assignments below do not write into a
        # slice (which makes pandas emit a SettingWithCopyWarning).
        has_missing = df[valid_columns].isna().any(axis=1)
        if has_missing.any():
            df = df.loc[~has_missing].copy()

        # Convert the checked columns to strings. When no row was dropped
        # above, this updates the table of the OCEL in place.
        for col in valid_columns:
            df[col] = df[col].astype(str)

        # After the conversion every value is a string, so a value is empty
        # exactly when it is equal to ""; keep the rows without empty values
        is_filled = (df[valid_columns] != "").all(axis=1)
        if not is_filled.all():
            df = df.loc[is_filled]

        setattr(ocel, tab, df)

    # Warn (without failing) when the identifiers are not unique;
    # stacklevel=2 attributes the warning to the caller of this function
    uniqueness_checks = (
        (
            ocel.events,
            event_id_col,
            "The event identifiers in the OCEL are not unique!",
        ),
        (
            ocel.objects,
            object_id_col,
            "The object identifiers in the OCEL are not unique!",
        ),
    )
    for table, id_col, message in uniqueness_checks:
        if len(table) > 0 and table[id_col].nunique() < len(table):
            warnings.warn(message, stacklevel=2)

    return ocel