'''
PM4Py – A Process Mining Library for Python
Copyright (C) 2024 Process Intelligence Solutions UG (haftungsbeschränkt)
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see this software project's root or
visit <https://www.gnu.org/licenses/>.
Website: https://processintelligence.solutions
Contact: info@processintelligence.solutions
'''
import time
import traceback
import pandas as pd
from dateutil.parser import parse
from pm4py.util.dt_parsing.variants import strpfromiso
from typing import Optional, Dict, Any
from enum import Enum
from pm4py.util import exec_utils, pandas_utils
import importlib.util
class Parameters(Enum):
    """
    Keys recognised in the ``parameters`` dictionary passed to ``apply``.
    """

    # Owner (user or organisation) of the repository, e.g. "pm4py".
    OWNER = "owner"
    # Name of the repository, e.g. "pm4py-core".
    REPOSITORY = "repository"
    # Authorization token; when provided it is sent as a Bearer token.
    AUTH_TOKEN = "auth_token"
def _fetch_json_list(session: Any, url: str, timeout: float = 60.0) -> list:
    """
    Performs a GET request and returns the decoded JSON body, which must be a list.

    Parameters
    ---------------
    session
        ``requests.Session`` used to perform the request
    url
        Address to fetch
    timeout
        Value (in seconds) forwarded to requests as ``timeout``

    Returns
    ---------------
    payload
        Decoded JSON list

    Raises
    ---------------
    requests.HTTPError
        If the server answers with an error status code
    ValueError
        If the body cannot be decoded as JSON, or is not a JSON list
    """
    response = session.get(url, timeout=timeout)
    # Error answers (rate limit exceeded, repository not found, ...) carry a
    # JSON *object*; failing here keeps them from being mistaken for a page.
    response.raise_for_status()
    payload = response.json()
    if not isinstance(payload, list):
        raise ValueError(
            "expected a JSON list from "
            + url
            + ", got "
            + type(payload).__name__
        )
    return payload


def _login_of(account: Optional[Dict[str, Any]]) -> Optional[str]:
    """
    Returns the ``login`` of a user object, or None when the object is missing/null.
    """
    return account["login"] if account else None


def _build_event(
    issue: Dict[str, Any],
    owner: str,
    repo: str,
    timeline_url: str,
    activity: str,
    created_at: str,
    resource: Optional[str],
) -> Dict[str, Any]:
    """
    Builds a single event (one row of the resulting dataframe).

    The case-level attributes (owner, repository, author association, title and,
    for pull requests, the pull request URL) are copied from the issue; the
    timeline URL of the issue is used as case identifier.
    """
    event = {
        "case:owner": owner,
        "case:repo": owner + "/" + repo,
        "case:concept:name": timeline_url,
        "time:timestamp": strpfromiso.fix_naivety(parse(created_at)),
        "concept:name": activity,
        "org:resource": resource,
        "case:author_association": issue["author_association"],
        "case:title": issue["title"],
    }
    if "pull_request" in issue:
        event["case:pull_request"] = issue["pull_request"]["url"]
    return event


def _collect_issue_events(
    session: Any,
    issue: Dict[str, Any],
    owner: str,
    repo: str,
    events: list,
) -> None:
    """
    Appends to ``events`` the "created" event of an issue followed by the
    events of its timeline.

    The "created" event is appended before the timeline is requested, so it is
    retained even if the timeline request fails.
    """
    timeline_url = issue["timeline_url"]
    events.append(
        _build_event(
            issue,
            owner,
            repo,
            timeline_url,
            "created",
            issue["created_at"],
            # a null user would otherwise abort the whole extraction
            _login_of(issue["user"]),
        )
    )
    # NOTE(review): a single request is made per timeline, so if the endpoint
    # paginates only its first page is read - confirm against the API docs.
    timeline = _fetch_json_list(session, timeline_url)
    for entry in reversed(timeline):
        # Only entries carrying a timestamp, an event name and an actor field
        # are turned into events.
        if "created_at" in entry and "event" in entry and "actor" in entry:
            events.append(
                _build_event(
                    issue,
                    owner,
                    repo,
                    timeline_url,
                    entry["event"],
                    entry["created_at"],
                    # a null actor is kept as a missing resource
                    _login_of(entry["actor"]),
                )
            )


def apply(parameters: Optional[Dict[Any, str]] = None) -> pd.DataFrame:
    """
    Extracts a dataframe containing the history of the issues of a Github repository.
    According to the API limit rate of public/registered users, only a part of the events
    can be returned.

    The extraction is best-effort: when a request fails (HTTP error status,
    unexpected response, network problem) or the user interrupts the process,
    the traceback is printed and the events collected so far are returned.

    Parameters
    ---------------
    parameters
        Parameters of the algorithm, including:
        - Parameters.OWNER => owner of the repository (e.g., pm4py)
        - Parameters.REPOSITORY => name of the repository (e.g., pm4py-core)
        - Parameters.AUTH_TOKEN => authorization token

    Returns
    ---------------
    dataframe
        Pandas dataframe with one row per event. When at least one event has
        been extracted, the rows are sorted by case, timestamp and insertion
        index, and the "@@index" / "@@case_index" columns are added.
    """
    # Imported outside the try block so that a missing dependency is reported
    # to the caller instead of being swallowed by the best-effort handler.
    import requests

    if parameters is None:
        parameters = {}

    owner = exec_utils.get_param_value(Parameters.OWNER, parameters, "pm4py")
    repo = exec_utils.get_param_value(
        Parameters.REPOSITORY, parameters, "pm4py-core"
    )
    auth_token = exec_utils.get_param_value(
        Parameters.AUTH_TOKEN, parameters, None
    )

    # One session for all the requests: connections are reused and the
    # authorization header is set only once.
    session = requests.Session()
    if auth_token is not None:
        session.headers["Authorization"] = "Bearer " + auth_token

    # The availability of tqdm does not change between pages.
    has_tqdm = importlib.util.find_spec("tqdm") is not None

    events = []
    page = 0
    progress = None
    try:
        while True:
            page += 1
            issues = _fetch_json_list(
                session,
                "https://api.github.com/repos/"
                + owner
                + "/"
                + repo
                + "/issues?state=all&per_page=100&page="
                + str(page),
            )
            if not issues:
                # an empty page marks the end of the issue list
                break

            if has_tqdm:
                from tqdm.auto import tqdm

                progress = tqdm(
                    total=len(issues),
                    desc="extracting issues of page "
                    + str(page)
                    + ", progress :: ",
                )

            for issue in issues:
                if "timeline_url" in issue:
                    _collect_issue_events(session, issue, owner, repo, events)
                if progress is not None:
                    progress.update()

            if progress is not None:
                progress.close()
                progress = None

            # short pause between pages
            time.sleep(1)
    except (Exception, KeyboardInterrupt):
        # Best effort: keep what has been collected so far. SystemExit and
        # GeneratorExit are deliberately not intercepted.
        traceback.print_exc()
    finally:
        if progress is not None:
            progress.close()
        session.close()

    dataframe = pandas_utils.instantiate_dataframe(events)
    if len(dataframe) > 0:
        dataframe = pandas_utils.insert_index(
            dataframe, "@@index", copy_dataframe=False, reset_index=False
        )
        # Sorting by timestamp first makes the case index follow the order in
        # which the cases start.
        dataframe = dataframe.sort_values(["time:timestamp", "@@index"])
        dataframe["@@case_index"] = dataframe.groupby(
            "case:concept:name", sort=False
        ).ngroup()
        dataframe = dataframe.sort_values(
            ["@@case_index", "time:timestamp", "@@index"]
        )
    return dataframe