# Source code for pm4py.algo.querying.llm.abstractions.log_to_cols_descr

'''
    PM4Py – A Process Mining Library for Python
Copyright (C) 2024 Process Intelligence Solutions UG (haftungsbeschränkt)

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU Affero General Public License for more details.

You should have received a copy of the GNU Affero General Public License
along with this program.  If not, see this software project's root or 
visit <https://www.gnu.org/licenses/>.

Website: https://processintelligence.solutions
Contact: info@processintelligence.solutions
'''

from pm4py.objects.log.obj import EventLog, EventStream
import pandas as pd
from typing import Optional, Dict, Any, Union
from pm4py.objects.conversion.log import converter as log_converter
from enum import Enum
from pm4py.util import exec_utils, constants


class Parameters(Enum):
    """Keys recognised in the ``parameters`` dictionary passed to ``apply``."""

    # Length budget for the textual description: once the running length
    # reaches this value, ``apply`` stops appending further categorical values.
    MAX_LEN = "max_len"
    # Name of the case-identifier column; ``apply`` drops that column before
    # describing the remaining ones.
    CASE_ID_KEY = constants.PARAMETER_CONSTANT_CASEID_KEY
def apply(
    log_obj: Union[EventLog, EventStream, pd.DataFrame],
    parameters: Optional[Dict[Any, Any]] = None,
) -> str:
    """
    Build a textual, per-column description of an event log.

    The log is converted to a dataframe, the case-identifier column is
    removed, and every remaining column is summarised on one line:

    * columns whose dtype string contains ``float``, ``double`` or ``date``
      get the number of empty (NaN) cells and the 0/25/50/75/100% quantiles;
    * columns whose dtype string contains ``object``, ``str`` (hence also
      ``string``) or ``bool`` and that hold more than one distinct value get
      the number of empty cells followed by ``(value; freq. count)`` entries,
      emitted from the most to the least frequent across all such columns
      until the length budget is reached.

    Columns matching neither rule (e.g. integer columns) are not described.

    Parameters
    ---------------
    log_obj
        Event log, event stream or dataframe to describe.
    parameters
        Optional dictionary of parameters, including:
        - Parameters.MAX_LEN => length budget of the description
          (default: ``constants.OPENAI_MAX_LEN``)
        - Parameters.CASE_ID_KEY => name of the case-identifier column, which
          is excluded (default: ``constants.CASE_CONCEPT_NAME``)

    Returns
    ---------------
    descr
        One line per described column, ordered by column name and joined
        with newlines.
    """
    if parameters is None:
        parameters = {}

    log_obj = log_converter.apply(
        log_obj,
        variant=log_converter.Variants.TO_DATA_FRAME,
        parameters=parameters,
    )

    max_len = exec_utils.get_param_value(
        Parameters.MAX_LEN, parameters, constants.OPENAI_MAX_LEN
    )
    case_id_key = exec_utils.get_param_value(
        Parameters.CASE_ID_KEY, parameters, constants.CASE_CONCEPT_NAME
    )

    # The case identifier is not described.
    log_obj = log_obj[[x for x in log_obj.columns if x != case_id_key]]
    cols_dtypes = {x: str(log_obj[x].dtype) for x in log_obj.columns}

    num_nans = {}
    lst_values = []
    for col, dtype_name in cols_dtypes.items():
        num_nans[col] = log_obj[col].isna().sum()
        # "str" also matches "string", and additionally covers the newer
        # pandas string dtype whose name is just "str".
        if (
            "object" in dtype_name
            or "str" in dtype_name
            or "bool" in dtype_name
        ):
            values = log_obj[col].value_counts().to_dict()
            # Columns with a single distinct value are left out.
            if len(values) > 1:
                for value, count in values.items():
                    lst_values.append((col, value, count))

    # Most frequent values first; ties are broken by column name, then value.
    try:
        lst_values = sorted(
            lst_values, key=lambda t: (t[2], t[0], t[1]), reverse=True
        )
    except TypeError:
        # An object column may mix non-comparable types (e.g. str and int);
        # compare the string form of the value so the tie-break cannot fail.
        lst_values = sorted(
            lst_values, key=lambda t: (t[2], t[0], str(t[1])), reverse=True
        )

    ret = {}
    curr_len = 0

    # Numeric/date summaries are always emitted; they only consume budget.
    for col, dtype_name in cols_dtypes.items():
        if (
            "float" in dtype_name
            or "double" in dtype_name
            or "date" in dtype_name
        ):
            quantiles = " quantiles: " + str(
                log_obj[col].quantile([0.0, 0.25, 0.5, 0.75, 1.0]).to_dict()
            )
            nans = " empty: " + str(num_nans[col]) + " "
            curr_len += len(col) + len(nans) + len(quantiles) + 1
            ret[col] = nans + quantiles

    for col, value, count in lst_values:
        if curr_len >= max_len:
            break
        if col not in ret:
            stru = " empty: " + str(num_nans[col]) + " values:"
            ret[col] = stru
            # NOTE(review): the constant 9 is kept from the original; it
            # presumably approximates per-column overhead - confirm.
            curr_len += len(col) + len(stru) + 9
        stru = " (" + str(value) + "; freq. " + str(count) + ")"
        ret[col] += stru
        curr_len += len(stru)

    return "\n".join(k + " " + ret[k] for k in sorted(ret))