Source code for pm4py.algo.connectors.variants.github_repo

'''
    PM4Py – A Process Mining Library for Python
Copyright (C) 2024 Process Intelligence Solutions UG (haftungsbeschränkt)

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU Affero General Public License for more details.

You should have received a copy of the GNU Affero General Public License
along with this program.  If not, see this software project's root or
visit <https://www.gnu.org/licenses/>.

Website: https://processintelligence.solutions
Contact: info@processintelligence.solutions
'''

import time
import traceback
import pandas as pd
from dateutil.parser import parse
from pm4py.util.dt_parsing.variants import strpfromiso
from typing import Optional, Dict, Any
from enum import Enum
from pm4py.util import exec_utils, pandas_utils
import importlib.util


class Parameters(Enum):
    """Keys recognised in the ``parameters`` dictionary passed to ``apply``."""

    # Owner (user or organisation) of the repository, e.g. "pm4py".
    OWNER = "owner"
    # Name of the repository, e.g. "pm4py-core".
    REPOSITORY = "repository"
    # Authorization token sent to the GitHub API (optional).
    AUTH_TOKEN = "auth_token"
# Upper bound (seconds) for a single HTTP request, so that a stalled
# connection cannot block the extraction forever.
_REQUEST_TIMEOUT_SECONDS = 30


def _build_event(owner, repo, issue, raw_timestamp, activity, resource):
    """
    Builds a single event (row of the final dataframe) for the given issue.

    Parameters
    ---------------
    owner
        Owner of the repository
    repo
        Name of the repository
    issue
        Issue object (dictionary) as returned by the Github API
    raw_timestamp
        Timestamp string of the event, as returned by the Github API
    activity
        Name of the activity (value of "concept:name")
    resource
        Login of the user that performed the activity

    Returns
    ---------------
    event
        Dictionary describing the event
    """
    event = {
        "case:owner": owner,
        "case:repo": owner + "/" + repo,
        # the timeline URL is used as case identifier of the issue
        "case:concept:name": issue["timeline_url"],
        "time:timestamp": strpfromiso.fix_naivety(parse(raw_timestamp)),
        "concept:name": activity,
        "org:resource": resource,
        "case:author_association": issue["author_association"],
        "case:title": issue["title"],
    }
    if "pull_request" in issue:
        event["case:pull_request"] = issue["pull_request"]["url"]
    return event


def apply(parameters: Optional[Dict[Any, str]] = None) -> pd.DataFrame:
    """
    Extracts a dataframe containing the history of the issues of a Github
    repository. According to the API limit rate of public/registered users,
    only a part of the events can be returned.

    The extraction is best-effort: as soon as a request fails or the API
    answers with something different from the expected list (e.g., rate limit
    exceeded, repository not found), the problem is printed and the events
    collected so far are returned.

    Parameters
    ---------------
    parameters
        Parameters of the algorithm, including:
        - Parameters.OWNER => owner of the repository (e.g., pm4py)
        - Parameters.REPOSITORY => name of the repository (e.g., pm4py-core)
        - Parameters.AUTH_TOKEN => authorization token

    Returns
    ---------------
    dataframe
        Pandas dataframe
    """
    import requests

    if parameters is None:
        parameters = {}

    owner = exec_utils.get_param_value(Parameters.OWNER, parameters, "pm4py")
    repo = exec_utils.get_param_value(
        Parameters.REPOSITORY, parameters, "pm4py-core"
    )
    auth_token = exec_utils.get_param_value(
        Parameters.AUTH_TOKEN, parameters, None
    )

    headers = {}
    if auth_token is not None:
        headers["Authorization"] = "Bearer " + auth_token

    page = 0
    events = []
    progress = None

    while True:
        page += 1
        try:
            r = requests.get(
                "https://api.github.com/repos/"
                + owner
                + "/"
                + repo
                + "/issues?state=all&per_page=100&page="
                + str(page),
                headers=headers,
                timeout=_REQUEST_TIMEOUT_SECONDS,
            )
            issues = r.json()

            if not issues:
                # empty page => all the issues have been read
                break

            if not isinstance(issues, list):
                # On errors (rate limit, unknown repository, bad credentials)
                # the API returns a JSON object instead of a list. Iterating
                # it would silently yield no issue and the pagination would
                # never terminate, so stop explicitly.
                raise RuntimeError(
                    "unexpected response from the Github API (HTTP "
                    + str(r.status_code)
                    + "): "
                    + str(issues)
                )

            if importlib.util.find_spec("tqdm"):
                from tqdm.auto import tqdm

                progress = tqdm(
                    total=len(issues),
                    desc="extracting issues of page "
                    + str(page)
                    + ", progress :: ",
                )

            for i in issues:
                if "timeline_url" in i:
                    timeline_url = i["timeline_url"]

                    # the creation of the issue is not part of its timeline,
                    # so it is recorded as an explicit "created" event
                    events.append(
                        _build_event(
                            owner,
                            repo,
                            i,
                            i["created_at"],
                            "created",
                            i["user"]["login"],
                        )
                    )

                    r2 = requests.get(
                        timeline_url,
                        headers=headers,
                        timeout=_REQUEST_TIMEOUT_SECONDS,
                    )
                    issue_events = r2.json()

                    if not isinstance(issue_events, list):
                        # same error-object situation as for the issues page
                        raise RuntimeError(
                            "unexpected response from the Github API (HTTP "
                            + str(r2.status_code)
                            + "): "
                            + str(issue_events)
                        )

                    issue_events.reverse()

                    for ev in issue_events:
                        # Only events carrying timestamp, type and actor are
                        # kept. The actor can be present but null; such
                        # events are skipped instead of aborting the whole
                        # extraction.
                        if (
                            "created_at" in ev
                            and "event" in ev
                            and ev.get("actor") is not None
                        ):
                            events.append(
                                _build_event(
                                    owner,
                                    repo,
                                    i,
                                    ev["created_at"],
                                    ev["event"],
                                    ev["actor"]["login"],
                                )
                            )

                if progress is not None:
                    progress.update()

            if progress is not None:
                progress.close()

            # short pause between pages, to be gentle with the API rate limit
            time.sleep(1)
        except BaseException:
            # NOTE(review): BaseException is kept on purpose so that any
            # interruption (including Ctrl-C) still returns the events
            # collected so far - confirm this best-effort behavior is wanted.
            traceback.print_exc()
            if progress is not None:
                progress.close()
            break

    dataframe = pandas_utils.instantiate_dataframe(events)

    if len(dataframe) > 0:
        # "@@index" keeps the insertion order and acts as tie-breaker for
        # events sharing the same timestamp
        dataframe = pandas_utils.insert_index(
            dataframe, "@@index", copy_dataframe=False, reset_index=False
        )
        dataframe = dataframe.sort_values(["time:timestamp", "@@index"])
        # cases are numbered by the order of their first event, then the
        # dataframe is grouped by case and ordered in time inside each case
        dataframe["@@case_index"] = dataframe.groupby(
            "case:concept:name", sort=False
        ).ngroup()
        dataframe = dataframe.sort_values(
            ["@@case_index", "time:timestamp", "@@index"]
        )

    return dataframe