Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Evolution metrics implementation #285

Merged
merged 5 commits into from
May 31, 2019
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
134 changes: 127 additions & 7 deletions augur/datasources/augur_db/augur_db.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#SPDX-License-Identifier: MIT
"""
Data source that uses the Augur relational database of GitHub activity.
Data source that uses the Augur relational database of GitHub activity.
"""

import pandas as pd
Expand Down Expand Up @@ -29,7 +29,7 @@ def __init__(self, user, password, host, port, dbname, schema):
connect_args={'options': '-csearch_path={}'.format(schema)})

logger.debug('GHTorrent: Connecting to {} schema of {}:{}/{} as {}'.format(schema, host, port, dbname, user))

# try:
# self.userid('howderek')
# except Exception as e:
Expand All @@ -56,31 +56,151 @@ def code_changes(self, repo_url, period='day', begin_date=None, end_date=None):
end_date = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

code_changes_SQL = s.sql.text("""
SELECT date_trunc(:period, cmt_committer_date::DATE) as commit_date, COUNT(cmt_id)
SELECT date_trunc(:period, cmt_committer_date::DATE) as commit_date, COUNT(cmt_id)
FROM commits
WHERE repo_id = (SELECT repo_id FROM repo WHERE repo_git LIKE :repourl LIMIT 1)
WHERE repo_id = (SELECT repo_id FROM repo WHERE repo_git LIKE :repourl LIMIT 1)
AND cmt_committer_date BETWEEN :begin_date AND :end_date
GROUP BY commit_date
ORDER BY commit_date;
""")

results = pd.read_sql(code_changes_SQL, self.db, params={'repourl': '%{}%'.format(repo_url), 'period': period,
results = pd.read_sql(code_changes_SQL, self.db, params={'repourl': '%{}%'.format(repo_url), 'period': period,
'begin_date': begin_date, 'end_date': end_date})
return results

@annotate(tag='code-changes-lines')
def code_changes_lines(self, repo_url, period='day', begin_date=None, end_date=None):
"""Returns a timeseries of code changes added and removed.

:param repo_url: The repository's URL
:param period: To set the periodicity to 'day', 'week', 'month', or 'year', defaults to 'day'
:param begin_date: Specifies the begin date, defaults to '1970-1-1 00:00:00'
:param end_date: Specifies the end date, defaults to datetime.now()
:return: DataFrame of code changes/period
"""
if not begin_date:
begin_date = '1970-1-1 00:00:00'
if not end_date:
end_date = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

code_changes_lines_SQL = s.sql.text("""
SELECT date_trunc(:period, cmt_author_date::DATE) as commit_date, SUM(cmt_added) AS added, SUM(cmt_removed) as removed
FROM commits
WHERE repo_id = (SELECT repo_id FROM repo WHERE repo_git LIKE :repourl LIMIT 1)
AND cmt_author_date BETWEEN :begin_date AND :end_date
GROUP BY commit_date
ORDER BY commit_date;
""")

results = pd.read_sql(code_changes_lines_SQL, self.db, params={'repourl': '%{}%'.format(repo_url), 'period': period,
'begin_date': begin_date, 'end_date': end_date})
return results

@annotate(tag='issues-new')
def issues_new(self, repo_url, period='day', begin_date=None, end_date=None):
"""Returns a timeseries of new issues opened.

:param repo_url: The repository's URL
:param period: To set the periodicity to 'day', 'week', 'month', or 'year', defaults to 'day'
:param begin_date: Specifies the begin date, defaults to '1970-1-1 00:00:00'
:param end_date: Specifies the end date, defaults to datetime.now()
:return: DataFrame of new issues/period
"""
if not begin_date:
begin_date = '1970-1-1 00:00:00'
if not end_date:
end_date = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

issues_new_SQL = s.sql.text("""
SELECT date_trunc(:period, created_at::DATE) as issue_date, COUNT(issue_id) as issues
FROM issues
WHERE repo_id = (SELECT repo_id FROM repo WHERE repo_git LIKE :repourl LIMIT 1)
AND created_at BETWEEN :begin_date AND :end_date
GROUP BY issue_date
ORDER BY issue_date;
""")

results = pd.read_sql(issues_new_SQL, self.db, params={'repourl': '%{}%'.format(repo_url), 'period': period,
'begin_date': begin_date, 'end_date': end_date})
return results


@annotate(tag='issues-closed')
def issues_closed(self, repo_url, period='day', begin_date=None, end_date=None):
"""Returns a timeseries of issues closed.

:param repo_url: The repository's URL
:param period: To set the periodicity to 'day', 'week', 'month', or 'year', defaults to 'day'
:param begin_date: Specifies the begin date, defaults to '1970-1-1 00:00:00'
:param end_date: Specifies the end date, defaults to datetime.now()
:return: DataFrame of issues closed/period
"""
if not begin_date:
begin_date = '1970-1-1 00:00:00'
if not end_date:
end_date = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

issues_closed_SQL = s.sql.text("""
SELECT date_trunc(:period, closed_at::DATE) as issue_close_date, COUNT(issue_id) as issues
FROM issues
WHERE repo_id = (SELECT repo_id FROM repo WHERE repo_git LIKE :repourl LIMIT 1)
AND closed_at IS NOT NULL AND closed_at BETWEEN :begin_date AND :end_date
GROUP BY issue_close_date
ORDER BY issue_close_date;
""")

results = pd.read_sql(issues_closed_SQL, self.db, params={'repourl': '%{}%'.format(repo_url), 'period': period,
'begin_date': begin_date, 'end_date': end_date})
return results

@annotate(tag='issue-duration')
def issue_duration(self, repo_url):
"""Returns the duration of each issue.

:param repo_url: The repository's URL
:return: DataFrame of issue id with the corresponding duration
"""
issue_duration_SQL = s.sql.text("""
SELECT issue_id, (closed_at - created_at) AS duration
FROM issues
WHERE repo_id = (SELECT repo_id FROM repo WHERE repo_git LIKE :repourl LIMIT 1)
AND closed_at IS NOT NULL
ORDER BY issue_id;
""")

results = pd.read_sql(issue_duration_SQL, self.db, params={'repourl': f'%{repo_url}%'})
return results

@annotate(tag='issue-backlog')
def issues_backlog(self, repo_url):
"""Returns number of issues currently open.

:param repo_url: The repository's URL
:return: DataFrame of count of issues currently open.
"""
issues_backlog_SQL = s.sql.text("""
SELECT COUNT(*)
FROM issues
WHERE repo_id = (SELECT repo_id FROM repo WHERE repo_git LIKE :repourl LIMIT 1)
AND issue_state='open'
""")

result = pd.read_sql(issues_backlog_SQL, self.db, params={'repourl': f'%{repo_url}%'})
return result

#####################################
### EXPERIMENTAL ###
#####################################

@annotate(tag='lines-changed-by-author')
def lines_changed_by_author(self, repo_url):
"""
Returns number of lines changed per author per day
Returns number of lines changed per author per day

:param repo_url: the repository's URL
"""
linesChangedByAuthorSQL = s.sql.text("""
SELECT cmt_author_email, cmt_author_date, cmt_author_affiliation as affiliation,
SELECT cmt_author_email, cmt_author_date, cmt_author_affiliation as affiliation,
SUM(cmt_added) as additions, SUM(cmt_removed) as deletions, SUM(cmt_whitespace) as whitespace
FROM commits
WHERE repo_id = (SELECT repo_id FROM repo WHERE repo_git LIKE :repourl LIMIT 1)
Expand Down