# Source code for pm4py.algo.querying.llm.abstractions.log_to_cols_descr
'''
PM4Py – A Process Mining Library for Python
Copyright (C) 2024 Process Intelligence Solutions UG (haftungsbeschränkt)
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see this software project's root or
visit <https://www.gnu.org/licenses/>.
Website: https://processintelligence.solutions
Contact: info@processintelligence.solutions
'''
from pm4py.objects.log.obj import EventLog, EventStream
import pandas as pd
from typing import Optional, Dict, Any, Union
from pm4py.objects.conversion.log import converter as log_converter
from enum import Enum
from pm4py.util import exec_utils, constants
# [docs]
class Parameters(Enum):
    """Keys recognised in the ``parameters`` dictionary of :func:`apply`."""

    # Budget checked against a running length counter while value/frequency
    # pairs are appended to the description.
    MAX_LEN = "max_len"
    # Name of the case identifier column, which is left out of the description.
    CASE_ID_KEY = constants.PARAMETER_CONSTANT_CASEID_KEY
# [docs]
def apply(
    log_obj: Union[EventLog, EventStream, pd.DataFrame],
    parameters: Optional[Dict[Any, Any]] = None,
) -> str:
    """
    Build a textual, column-by-column description of a log.

    The log is converted to a dataframe and the case identifier column is
    dropped. For each remaining column the number of missing values is
    counted, and then:

    - columns whose dtype name contains ``float``, ``double`` or ``date``
      are described by their 0/25/50/75/100% quantiles;
    - columns whose dtype name contains ``object``, ``string`` or ``bool``
      and that have more than one distinct value are described by
      ``(value; freq. count)`` pairs, added in decreasing order of
      frequency across all such columns until the length budget is used up.

    Quantile descriptions are always emitted; only the value/frequency
    pairs are subject to the length budget.

    Parameters
    ----------
    log_obj
        Event log, event stream or dataframe to describe.
    parameters
        Optional dictionary of parameters (also forwarded to the log
        converter). Recognised keys:

        - ``Parameters.MAX_LEN``: length budget for the description
          (default: ``constants.OPENAI_MAX_LEN``).
        - ``Parameters.CASE_ID_KEY``: name of the case identifier column
          to exclude (default: ``constants.CASE_CONCEPT_NAME``).

    Returns
    -------
    str
        One line per described column, sorted by column name, each of the
        form ``<column>  empty: <n> ...``.
    """
    if parameters is None:
        parameters = {}

    # Work on a dataframe regardless of the representation that was passed in.
    log_obj = log_converter.apply(
        log_obj,
        variant=log_converter.Variants.TO_DATA_FRAME,
        parameters=parameters,
    )

    max_len = exec_utils.get_param_value(
        Parameters.MAX_LEN, parameters, constants.OPENAI_MAX_LEN
    )
    case_id_key = exec_utils.get_param_value(
        Parameters.CASE_ID_KEY, parameters, constants.CASE_CONCEPT_NAME
    )

    # The case identifier column is not part of the description.
    log_obj = log_obj[[x for x in log_obj.columns if x != case_id_key]]
    cols_dtypes = {x: str(log_obj[x].dtype) for x in log_obj.columns}

    # NOTE(review): integer dtypes match neither the categorical filter below
    # nor the quantile filter further down, so integer columns are never
    # described -- confirm that this is intended.
    num_nans = {}
    lst_values = []
    for col, dtype_name in cols_dtypes.items():
        num_nans[col] = log_obj[col].isna().sum()
        if (
            "object" in dtype_name
            or "string" in dtype_name
            or "bool" in dtype_name
        ):
            values = log_obj[col].value_counts().to_dict()
            # Columns with a single distinct value are skipped.
            if len(values) > 1:
                lst_values.extend(
                    (col, value, count) for value, count in values.items()
                )

    # Most frequent values first; column name and value break ties.
    try:
        lst_values = sorted(
            lst_values,
            key=lambda item: (item[2], item[0], item[1]),
            reverse=True,
        )
    except TypeError:
        # Values of different types (e.g. str and int inside one object
        # column) cannot be compared with each other when their frequencies
        # tie. Fall back to their string form for tie-breaking instead of
        # failing; inputs that sorted natively keep their original order.
        lst_values = sorted(
            lst_values,
            key=lambda item: (item[2], str(item[0]), str(item[1])),
            reverse=True,
        )

    ret = {}
    curr_len = 0

    # Quantile summaries are added unconditionally, but they do consume part
    # of the length budget available to the value/frequency pairs below.
    for col, dtype_name in cols_dtypes.items():
        if (
            "float" in dtype_name
            or "double" in dtype_name
            or "date" in dtype_name
        ):
            quantiles = " quantiles: " + str(
                log_obj[col].quantile([0.0, 0.25, 0.5, 0.75, 1.0]).to_dict()
            )
            nans = " empty: " + str(num_nans[col]) + " "
            curr_len += len(col) + len(nans) + len(quantiles) + 1
            ret[col] = nans + quantiles

    for col, value, count in lst_values:
        if curr_len >= max_len:
            break
        if col not in ret:
            # First value of this column: open its line with the NaN count.
            header = " empty: " + str(num_nans[col]) + " values:"
            ret[col] = header
            # The extra 9 is a fixed per-column overhead taken over from the
            # original accounting -- presumably separators; TODO confirm.
            curr_len += len(col) + len(header) + 9
        entry = " (" + str(value) + "; freq. " + str(count) + ")"
        ret[col] += entry
        curr_len += len(entry)

    return "\n".join(k + " " + ret[k] for k in sorted(ret))