Source code for pm4py.util.labels_similarity
'''
PM4Py – A Process Mining Library for Python
Copyright (C) 2024 Process Intelligence Solutions UG (haftungsbeschränkt)
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see this software project's root or
visit <https://www.gnu.org/licenses/>.
Website: https://processintelligence.solutions
Contact: info@processintelligence.solutions
'''
import difflib
from typing import List, Dict, Set
# ---------------------------------------------------------------------------
# 1. Similarity between two (unordered) label collections
# ---------------------------------------------------------------------------
[docs]
def label_sets_similarity(
labels_a: List[str],
labels_b: List[str],
threshold: float = 0.75,
) -> float:
"""
Dice-style similarity ∈ [0,1] that tolerates near-matches.
* Each label in A is greedily paired with its most similar,
still-unmatched label in B (SequenceMatcher ratio ≥ `threshold`).
* Similarity = 2·|matched| / (|A| + |B|).
"""
remaining_b: Set[str] = set(labels_b)
matches = 0
for la in labels_a:
if not remaining_b: # B exhausted
break
# find best candidate among the still-free labels in B
best, score = None, 0.0
for lb in remaining_b:
r = difflib.SequenceMatcher(None, la, lb).ratio()
if r > score:
best, score = lb, r
if score >= threshold:
matches += 1
remaining_b.remove(best)
return 2 * matches / (len(labels_a) + len(labels_b)) if (labels_a or labels_b) else 1.0
# ---------------------------------------------------------------------------
# 2. Greedy label mapping A → B (or A → A if no good match)
# ---------------------------------------------------------------------------
[docs]
def map_labels(
labels_a: List[str],
labels_b: List[str],
threshold: float = 0.75,
) -> Dict[str, str]:
"""
Returns a dict {label_in_A: mapped_label}.
* If an exact duplicate exists in B, map to that.
* Otherwise pick the *unused* label in B with the highest similarity
(SequenceMatcher ratio ≥ `threshold`).
* If nothing reaches the threshold, map to itself (no change).
"""
mapping: Dict[str, str] = {}
used_b: Set[str] = set()
for la in labels_a:
# 1) perfect match first
if la in labels_b and la not in used_b:
mapping[la] = la
used_b.add(la)
continue
# 2) best fuzzy match
best, score = None, 0.0
for lb in labels_b:
if lb in used_b:
continue
r = difflib.SequenceMatcher(None, la, lb).ratio()
if r > score:
best, score = lb, r
if best is not None and score >= threshold:
mapping[la] = best
used_b.add(best)
else:
mapping[la] = la # fallback: keep original
return mapping