Source code for pm4py.util.string_distance

'''
    PM4Py – A Process Mining Library for Python
Copyright (C) 2024 Process Intelligence Solutions UG (haftungsbeschränkt)

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU Affero General Public License for more details.

You should have received a copy of the GNU Affero General Public License
along with this program.  If not, see this software project's root or
visit <https://www.gnu.org/licenses/>.

Website: https://processintelligence.solutions
Contact: info@processintelligence.solutions
'''
import sys
import importlib.util
from typing import List, Union


[docs] def levenshtein_distance(s1, s2): if len(s1) < len(s2): return levenshtein_distance(s2, s1) if len(s2) == 0: return len(s1) previous_row = range(len(s2) + 1) for i, c1 in enumerate(s1): current_row = [i + 1] for j, c2 in enumerate(s2): insertions = previous_row[j + 1] + 1 deletions = current_row[j] + 1 substitutions = previous_row[j] + (c1 != c2) current_row.append(min(insertions, deletions, substitutions)) previous_row = current_row return previous_row[-1]
[docs] def levenshtein(stru1, stru2): if importlib.util.find_spec("stringdist"): import stringdist return stringdist.levenshtein(stru1, stru2) return levenshtein_distance(stru1, stru2)
[docs] def argmin_levenshtein(stru: str, list_stri: List[str]) -> Union[str, None]: """ Given a string (stru), finds a string in a list of strings (list_stri) that minimizes the Levenshtein distance. Parameters -------------- stru String (that is compared) list_stri List of comparison strings Returns -------------- argmin_dist String (belonging to list_stri) that minimizes the Levenshtein distance with the 'stru' argument """ if list_stri: len_stru = len(stru) argmin_dist = None min_edit_dist = sys.maxsize # sort the strings in the list based on the actual length in comparison # to the provided string list_stri = sorted(list_stri, key=lambda x: abs(len(x) - len_stru)) for comp_stri in list_stri: this_length_diff = abs(len(comp_stri) - len_stru) # to make things faster, breaks whether we reach a string # which distance is greater or equal than the edit distance # with a previous string, because the edit distance would then # be trivially greater or equal in length if this_length_diff >= min_edit_dist: break dist_this_comp = levenshtein(stru, comp_stri) if dist_this_comp < min_edit_dist: argmin_dist = comp_stri min_edit_dist = dist_this_comp return argmin_dist return None
[docs] def argmax_levenshtein(stru: str, list_stri: List[str]) -> Union[str, None]: """ Given a string (stru), finds a string in a list of strings (list_stri) that maximizes the Levenshtein distance. Parameters -------------- stru String (that is compared) list_stri List of comparison strings Returns -------------- argmax_dist String (belonging to list_stri) that maximizes the Levenshtein distance with the 'stru' argument """ if list_stri: len_stru = len(stru) argmax_dist = None max_edit_dist = 0 # sort the strings in the list based on the actual length in comparison # to the provided string list_stri = sorted(list_stri, key=lambda x: -len(x)) for comp_stri in list_stri: # to make things faster, breaks whether the # following condition is reached if len(comp_stri) + len_stru <= max_edit_dist: break dist_this_comp = levenshtein(stru, comp_stri) if dist_this_comp > max_edit_dist: argmax_dist = comp_stri max_edit_dist = dist_this_comp return argmax_dist return None