From e5fcc9bbf3c65f9678efea8fe4fe715205bc68db Mon Sep 17 00:00:00 2001
From: Stefan van der Walt
Date: Tue, 23 May 2023 17:06:08 -0700
Subject: [PATCH 1/2] Embed standard queries in library

---
 .gitignore                                 |  2 +
 devstats/__init__.py                       | 58 +++++++++++-------
 .../queries/issue_activity_since_date.gql  | 60 +++++++++++++++++++
 devstats/queries/pr_data_query.gql         | 32 ++++++++++
 pyproject.toml                             |  3 +
 5 files changed, 135 insertions(+), 20 deletions(-)
 create mode 100644 devstats/queries/issue_activity_since_date.gql
 create mode 100644 devstats/queries/pr_data_query.gql

diff --git a/.gitignore b/.gitignore
index 68bc17f..ba1b4ac 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,5 @@
+*~
+
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
diff --git a/devstats/__init__.py b/devstats/__init__.py
index 2b7cee5..346cc5a 100644
--- a/devstats/__init__.py
+++ b/devstats/__init__.py
@@ -3,6 +3,8 @@ import sys
 import json
 
 import click
+from glob import glob
+import re
 
 try:
     token = os.environ["GRAPH_API_KEY"]
@@ -102,16 +104,17 @@ def get_all_responses(query, query_type):
     Helper function to bypass GitHub GraphQL API node limit.
     """
     # Get data from a single response
+    print(f"Retrieving first page...", end="", flush=True)
     initial_data = send_query(query, query_type)
     data, last_cursor, total_count = parse_single_query(initial_data, query_type)
-    print(f"Retrieving {len(data)} out of {total_count} values...")
+
     # Continue requesting data (with pagination) until all are acquired
     while len(data) < total_count:
         rdata = send_query(query, query_type, cursor=last_cursor)
         pdata, last_cursor, _ = parse_single_query(rdata, query_type)
         data.extend(pdata)
-        print(f"Retrieving {len(data)} out of {total_count} values...")
-    print("Done.")
+        print(f"OK\nRetrieving {len(data)} out of {total_count} values...", end="", flush=True)
+    print("OK")
 
     return data
 
@@ -187,6 +190,7 @@ def dump(self, outfile):
             raise ValueError("raw_data is currently empty, nothing to dump")
 
         with open(outfile, "w") as outf:
+            print(f'Writing [{outfile}]')
             json.dump(self.raw_data, outf)
 
 
@@ -195,24 +199,38 @@ def dump(self, outfile):
 @click.argument("repo_name")
 def main(repo_owner, repo_name):
     """Download and save issue and pr data for `repo_owner`/`repo_name`."""
-    # Download issue data
-    issues = GithubGrabber(
-        "query_examples/issue_activity_since_date.gql",
-        "issues",
-        repo_owner=repo_owner,
-        repo_name=repo_name,
-    )
-    issues.get()
-    issues.dump(f"{repo_name}_issues.json")
-    # Download PR data
-    prs = GithubGrabber(
-        "query_examples/pr_data_query.gql",
-        "pullRequests",
-        repo_owner=repo_owner,
-        repo_name=repo_name,
+
+    query_files = glob(
+        os.path.join(os.path.dirname(__file__), 'queries/*.gql')
     )
-    prs.get()
-    prs.dump(f"{repo_name}_prs.json")
+
+    for n, query in enumerate(query_files):
+        if n != 0:
+            print()
+
+        print(f"Query: [{os.path.basename(query)}] on [{repo_owner}/{repo_name}]")
+        # Parse query type from gql
+        gql = open(query).read()
+        qtype_match = re.match(
+            'query\s*{\s*repository\(.*?\)\s*{\s*(pullRequests|issues)',
+            gql, flags=re.MULTILINE
+        )
+        if qtype_match is None:
+            print(f"Could not determine gql query type for {query}")
+            sys.exit(-1)
+        else:
+            qtype = qtype_match.group(1)
+
+        qname, qext = os.path.splitext(query)
+        data = GithubGrabber(
+            query,
+            qtype,
+            repo_owner=repo_owner,
+            repo_name=repo_name,
+        )
+        data.get()
+        ftype = {'issues': 'issues', 'pullRequests': 'PRs'}
+        data.dump(f"{repo_name}_{ftype.get(qtype, qtype)}.json")
 
 
 if __name__ == "__main__":
diff --git a/devstats/queries/issue_activity_since_date.gql b/devstats/queries/issue_activity_since_date.gql
new file mode 100644
index 0000000..2301975
--- /dev/null
+++ b/devstats/queries/issue_activity_since_date.gql
@@ -0,0 +1,60 @@
+query {
+  repository(owner: "_REPO_OWNER_", name: "_REPO_NAME_") {
+    issues(first: 100, filterBy: {since: "2020-01-01T00:00:00Z"}) {
+      totalCount
+      edges {
+        cursor
+        node {
+          number
+          title
+          createdAt
+          state
+          closedAt
+          updatedAt
+          url
+          labels(first: 100) {
+            edges {
+              node {
+                name
+              }
+            }
+          }
+          timelineItems(first: 100, itemTypes: [CROSS_REFERENCED_EVENT, ISSUE_COMMENT, CLOSED_EVENT]) {
+            totalCount
+            edges {
+              node {
+                __typename
+                ... on CrossReferencedEvent {
+                  source {
+                    ... on Issue {
+                      __typename
+                      number
+                      closed
+                    }
+                    ... on PullRequest {
+                      __typename
+                      number
+                      closed
+                    }
+                  }
+                  isCrossRepository
+                }
+                ... on IssueComment {
+                  author {
+                    login
+                  }
+                  createdAt
+                }
+                ... on ClosedEvent {
+                  actor {
+                    login
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+}
diff --git a/devstats/queries/pr_data_query.gql b/devstats/queries/pr_data_query.gql
new file mode 100644
index 0000000..55274db
--- /dev/null
+++ b/devstats/queries/pr_data_query.gql
@@ -0,0 +1,32 @@
+query {
+  repository(owner:"_REPO_OWNER_", name:"_REPO_NAME_") {
+    pullRequests(first:100) {
+      totalCount
+      edges {
+        cursor
+        node{
+          number
+          state
+          title
+          createdAt
+          baseRefName
+          mergeable
+          author{
+            login
+          }
+          authorAssociation
+          mergedBy{
+            login
+          }
+          mergedAt
+          reviews(states:APPROVED){
+            totalCount
+          }
+          participants(first:100){
+            totalCount
+          }
+        }
+      }
+    }
+  }
+}
diff --git a/pyproject.toml b/pyproject.toml
index 2704b3e..e513597 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -35,3 +35,6 @@ homepage = "https://github.com/scientific-python/devstats"
 
 [tool.setuptools.packages.find]
 include = ["devstats*"]
+
+[tool.setuptools.package-data]
+"*" = ["*.gql"]

From 486ece607bda11facf8043469e4188e18510107c Mon Sep 17 00:00:00 2001
From: Jarrod Millman
Date: Tue, 23 May 2023 17:20:12 -0700
Subject: [PATCH 2/2] Fix linting

---
 devstats/__init__.py | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

diff --git a/devstats/__init__.py b/devstats/__init__.py
index 346cc5a..17d9d9a 100644
--- a/devstats/__init__.py
+++ b/devstats/__init__.py
@@ -113,7 +113,11 @@ def get_all_responses(query, query_type):
         rdata = send_query(query, query_type, cursor=last_cursor)
         pdata, last_cursor, _ = parse_single_query(rdata, query_type)
         data.extend(pdata)
-        print(f"OK\nRetrieving {len(data)} out of {total_count} values...", end="", flush=True)
+        print(
+            f"OK\nRetrieving {len(data)} out of {total_count} values...",
+            end="",
+            flush=True,
+        )
     print("OK")
 
     return data
@@ -190,7 +194,7 @@ def dump(self, outfile):
             raise ValueError("raw_data is currently empty, nothing to dump")
 
         with open(outfile, "w") as outf:
-            print(f'Writing [{outfile}]')
+            print(f"Writing [{outfile}]")
             json.dump(self.raw_data, outf)
 
 
@@ -200,9 +204,7 @@ def dump(self, outfile):
 def main(repo_owner, repo_name):
     """Download and save issue and pr data for `repo_owner`/`repo_name`."""
 
-    query_files = glob(
-        os.path.join(os.path.dirname(__file__), 'queries/*.gql')
-    )
+    query_files = glob(os.path.join(os.path.dirname(__file__), "queries/*.gql"))
 
     for n, query in enumerate(query_files):
         if n != 0:
@@ -212,8 +214,9 @@ def main(repo_owner, repo_name):
         # Parse query type from gql
         gql = open(query).read()
         qtype_match = re.match(
-            'query\s*{\s*repository\(.*?\)\s*{\s*(pullRequests|issues)',
-            gql, flags=re.MULTILINE
+            r"query\s*{\s*repository\(.*?\)\s*{\s*(pullRequests|issues)",
+            gql,
+            flags=re.MULTILINE,
         )
         if qtype_match is None:
             print(f"Could not determine gql query type for {query}")
@@ -229,7 +232,7 @@ def main(repo_owner, repo_name):
             repo_name=repo_name,
         )
         data.get()
-        ftype = {'issues': 'issues', 'pullRequests': 'PRs'}
+        ftype = {"issues": "issues", "pullRequests": "PRs"}
         data.dump(f"{repo_name}_{ftype.get(qtype, qtype)}.json")