Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

function headers are displayed in a separate table cell #5

Open
wants to merge 7 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
208 changes: 208 additions & 0 deletions commits.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,208 @@

import datetime
import enum
import logging
import chardet
import tqdm
import subprocess
import re
import sys
import dataclasses
from typing import List, Set, Tuple, Optional, Any
from modification import Modification
from javadoc_analyzer import has_java_javadoc_changed


# Line patterns used to pick apart `git log` / `git show --name-status` output.
# Commit header line: "commit " followed by a full 40-hex-digit SHA-1 (group 1).
_commit_line = re.compile(r'^commit ([0-9a-f]{40})$')
# "Date:" header line; group 1 captures the date, a literal "T" and the run of
# digits/colons after it, so anything following (e.g. a UTC offset) is dropped.
_date_line = re.compile(r'^Date:\s*([0-9\-]+T[0-9\:]+)')
# Name-status line with status "M" and a path ending in ".java";
# group 1 is the full path, group 2 the path without the extension.
_src_line = re.compile(r'^M\t((.+)\.java)$')

@enum.unique
class CommitType(enum.Enum):
    """Category assigned to a commit according to its JavaDoc tag changes.

    Each member's value is the human-readable label for that category;
    ``UNKNOWN`` carries ``None`` and is the state before classification.
    """

    # No classification has been assigned yet.
    UNKNOWN = None

    # JavaDoc tags changed, but never in a file that changed nothing else.
    JAVA_AND_JAVADOC_TAGS_EVERYWHERE = "Arbitrary Java / JavaDoc changes"

    # At least one (but not every) file changed only its JavaDoc tags.
    ONLY_JAVADOC_TAGS_IN_SOME_FILES = "Some files have only JavaDoc tag changes"

    # Every file of the commit changed only its JavaDoc tags.
    ONLY_JAVADOC_TAGS_EVERYWHERE = "Whole commit has only JavaDoc tag changes"

    # No file of the commit changed any JavaDoc tag.
    WITHOUT_JAVADOC_TAGS = "Commit doesn't have JavaDoc tag changes"

# Module-level tallies, updated as a side effect of parsing and classifying.
# Incremented by Commit.classify for JAVA_AND_JAVADOC_TAGS_EVERYWHERE commits.
_mixed_commits: int = 0
# Incremented by Commit.classify for ONLY_JAVADOC_TAGS_IN_SOME_FILES commits.
_only_javadoc_in_some_files_commits: int = 0
# Incremented by Commit.classify for ONLY_JAVADOC_TAGS_EVERYWHERE commits.
_pure_javadoc_commits: int = 0
# Incremented by get_commits for every "commit <sha>" line found in the log.
_total_commits: int = 0
# Incremented by get_commits for every commit that lists at least one
# modified .java file.
_java_files_commits: int = 0

@dataclasses.dataclass()
class Commit:
    """A git commit together with the classification of its Java file changes.

    ``sha1``, ``files`` and ``date`` are supplied by the caller.
    ``commit_type``, ``file_statuses`` and ``modifications`` are filled in by
    :meth:`classify`.
    """

    # Commit hash passed to git and appended to the URL prefix in CSV rows.
    sha1: str
    # Paths of the files to analyse; None is treated like an empty list.
    files: Optional[List[Optional[str]]] = None
    # Commit date, forwarded to has_java_javadoc_changed and emitted in CSV.
    date: Optional[datetime.datetime] = None
    # Result of classify(); UNKNOWN until classify() has run.
    commit_type: CommitType = CommitType.UNKNOWN
    # One (java_changed, javadoc_changed, javadoc_tag_changed) triple per file.
    file_statuses: Optional[List[Tuple[bool, bool, bool]]] = None
    # Modifications collected from files whose only change is in JavaDoc tags.
    modifications: Optional[List[Modification]] = None

    @staticmethod
    def read_file_in_any_encoding(patch_filename: str, filename: str, comment: str = "") -> str:
        """Read ``patch_filename`` and decode it, trying several encodings.

        Decoding is attempted with UTF-8, then with
        ``sys.getdefaultencoding()``, then with the encoding guessed by
        ``chardet``. If the guess is missing, unknown to Python or still fails,
        the bytes are decoded as UTF-8 with undecodable bytes replaced, so the
        caller always receives text.

        Args:
            patch_filename: Path of the file to read.
            filename: Name used in warning messages only.
            comment: Extra context used in warning messages only.

        Returns:
            The decoded file content.

        Raises:
            OSError: If ``patch_filename`` cannot be opened or read.
        """
        with open(patch_filename, 'rb') as bf:
            bts = bf.read()

        try:
            return bts.decode('utf-8')
        except UnicodeDecodeError as ude1:
            logging.warning(f"File: {filename} of {comment} is not in UTF-8: {ude1}")

        # NOTE: on Python 3 sys.getdefaultencoding() is documented to be
        # 'utf-8', so this retry normally fails exactly like the first one.
        # It is kept so the sequence of attempts and warnings is unchanged.
        try:
            return bts.decode(sys.getdefaultencoding())
        except UnicodeDecodeError as ude2:
            logging.warning(f"File: {filename} of {comment} is not in sys.getdefaultencoding() = {sys.getdefaultencoding()}: {ude2}")

        enc = chardet.detect(bts)['encoding']
        logging.warning(f"File: {filename} of {comment} is likely in {enc} encoding")
        if enc is not None:
            try:
                return bts.decode(enc)
            except (UnicodeDecodeError, LookupError) as ude3:
                logging.warning(f"File: {filename} of {comment} could not be decoded as {enc}: {ude3}")

        # chardet may report no encoding at all (None) or a wrong one; fall
        # back to a lossy decode instead of failing on bts.decode(None).
        return bts.decode('utf-8', errors='replace')

    def classify(self, tmpdir: str) -> None:
        """Analyse every file of the commit and set ``commit_type``.

        For each path in ``files`` a patch is written into ``tmpdir`` with
        ``git format-patch`` and handed to ``has_java_javadoc_changed``. A file
        whose patch cannot be read or analysed is logged and recorded as
        ``(False, False, False)``. The method also stores ``file_statuses`` and
        ``modifications`` and bumps the matching module-level counter.

        Args:
            tmpdir: Directory passed to ``git format-patch -o``.

        Raises:
            subprocess.CalledProcessError: If ``git format-patch`` fails.
        """
        global _mixed_commits, _only_javadoc_in_some_files_commits, _pure_javadoc_commits

        file_statuses: List[Tuple[bool, bool, bool]] = []
        modifications: List[Modification] = []

        for f in self.files or []:
            # The command's stdout is taken as the path of the written patch.
            patchname = subprocess.check_output([
                'git', 'format-patch', '-1', '--numbered-files', '--unified=100000',
                '-o', tmpdir, self.sha1,
                '--', f
            ]).decode(sys.getdefaultencoding()).strip()
            try:
                patch = self.read_file_in_any_encoding(patchname, f, f"Commit: {self.sha1}")
                java_changed, javadoc_changed, tags_changed, file_mods = has_java_javadoc_changed(
                    f, patch, self.date, self.sha1
                )
                file_statuses.append((java_changed, javadoc_changed, tags_changed))
                # Modifications are kept only for files that changed nothing
                # but JavaDoc tags.
                if tags_changed and not java_changed and not javadoc_changed:
                    modifications.extend(file_mods)
            except Exception as e:
                # Deliberate best effort: one bad file must not abort the run.
                logging.error("Skipping bad patch of commit %s in file %s due to %s", self.sha1, f, e)
                file_statuses.append((False, False, False))

        pure_javadoc_tag_files_count = sum(
            1 for (j, d, t) in file_statuses if t and not j and not d
        )
        javadoc_tag_files_count = sum(
            1 for (_, _, t) in file_statuses if t
        )

        # The emptiness check keeps a commit with no analysed files from being
        # vacuously labelled "only JavaDoc tags everywhere" (0 == len([])).
        if file_statuses and pure_javadoc_tag_files_count == len(file_statuses):
            self.commit_type = CommitType.ONLY_JAVADOC_TAGS_EVERYWHERE
            _pure_javadoc_commits += 1
        elif pure_javadoc_tag_files_count > 0:
            self.commit_type = CommitType.ONLY_JAVADOC_TAGS_IN_SOME_FILES
            _only_javadoc_in_some_files_commits += 1
        elif javadoc_tag_files_count == 0:
            self.commit_type = CommitType.WITHOUT_JAVADOC_TAGS
        else:
            self.commit_type = CommitType.JAVA_AND_JAVADOC_TAGS_EVERYWHERE
            _mixed_commits += 1

        self.file_statuses = file_statuses
        self.modifications = modifications

    def get_csv_lines(self, url_prefix: str) -> List[List[Any]]:
        """Return the CSV rows describing this commit.

        Without modifications a single row of commit-level cells is returned;
        otherwise there is one row per modification (see :meth:`csv_line`).

        Args:
            url_prefix: Text prepended to ``sha1`` to form the commit link.
        """
        if not self.modifications:
            # NOTE(review): this row has 5 cells while csv_line rows have 8 --
            # confirm whether the consumer expects padding.
            return [[self.commit_type.value, url_prefix + self.sha1, self.date, '', '']]
        return [self.csv_line(i, url_prefix) for i in range(len(self.modifications))]

    def csv_line(self, i: int, url_prefix: str) -> List[Any]:
        """Return the CSV row for modification ``i``.

        Only the first row (``i < 1``) carries the commit type, link and date;
        later rows leave those three cells empty. The last cell is the number
        of days in the modification's ``time_offset``, or ``''`` when the
        offset is ``None``.

        Args:
            i: Index into ``modifications``; any value below 1 selects index 0.
            url_prefix: Text prepended to ``sha1`` to form the commit link.
        """
        is_first_row = i < 1
        mod = self.modifications[0 if is_first_row else i]

        if is_first_row:
            commit_cells = [self.commit_type.value, url_prefix + self.sha1, self.date]
        else:
            commit_cells = ['', '', '']

        days = '' if mod.time_offset is None else mod.time_offset.days
        return commit_cells + [
            mod.file_name,
            mod.javadoc_modification,
            mod.functionheader_modification,
            mod.functionheader_date,
            days,
        ]


def get_commits(single_commit: Optional[str] = None) -> List[Commit]:
    """Parse the git log and return the commits that modified ``.java`` files.

    Runs ``git show`` for ``single_commit`` when it is given, otherwise
    ``git log --all``, both with ``--name-status --date=iso-strict``. A
    ``Commit`` is produced for every log entry with at least one line matching
    ``_src_line``. The module counters ``_total_commits`` and
    ``_java_files_commits`` are updated as a side effect.

    Args:
        single_commit: Revision to inspect; ``None`` or empty means the whole
            history.

    Returns:
        The commits in the order they appear in the git output.

    Raises:
        subprocess.CalledProcessError: If the git command fails.
    """
    global _total_commits

    if single_commit:
        git_cmd = ['git', 'show', '--name-status', '--date=iso-strict', single_commit]
    else:
        git_cmd = ['git', 'log', '--name-status', '--date=iso-strict', '--all']

    # Commit messages and author names may contain bytes that are not valid in
    # the default encoding. Replace them rather than abort: only the commit,
    # date and file lines are parsed.
    log = subprocess.check_output(git_cmd).decode(sys.getdefaultencoding(), errors='replace')
    loglines = log.replace('\r', '').split('\n')

    commits: List[Commit] = []
    cur_commit = None
    cur_date = None
    cur_files = []

    def release() -> None:
        """Emit the commit collected so far, if it touched any Java file."""
        global _java_files_commits
        if cur_commit and cur_files:
            _java_files_commits += 1
            cur_realdatetime = datetime.datetime.strptime(cur_date, "%Y-%m-%dT%H:%M:%S")
            commits.append(Commit(cur_commit, cur_files.copy(), cur_realdatetime))

    print("Analyzing log...")

    for line in tqdm.tqdm(loglines):
        # Patterns are tried in priority order; later ones are skipped once an
        # earlier one has matched.
        commit_match = _commit_line.match(line)
        if commit_match:
            _total_commits += 1
            # A new header ends the previous entry.
            release()
            cur_commit = commit_match.group(1)
            cur_files = []
            continue

        date_match = _date_line.match(line)
        if date_match:
            cur_date = date_match.group(1)
            continue

        file_match = _src_line.match(line)
        if file_match:
            cur_files.append(file_match.group(1))

    # The last entry has no following header to trigger its release.
    release()
    return commits
160 changes: 160 additions & 0 deletions javadoc_analyzer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,160 @@
from typing import List, Set, Tuple, Optional, Any
import re
import logging
import datetime
import itertools
from modification import Modification, find_modification_before


# Patterns applied to individual patch lines. Several accept an optional
# leading "+"/"-" followed by a space or tab, i.e. a patch change marker.

# Line that opens a JavaDoc comment: optional change marker, optional
# whitespace, then "/**".
_javadoc_start_marker = re.compile(r'^((\+|\-)( |\t))?\s*/\*\*\s*')
# Line that ends with "*/" (optionally followed by whitespace).
_javadoc_end_marker = re.compile(r'^.*(\*/|\*\s*\*/)\s*$')
# Line whose first token, after an optional "*" or "/**", is one of the tags
# @param, @return, @exception, @throw or @throws followed by whitespace.
_javadoc_section_marker = re.compile(r'^((\+|\-)( |\t))?\s*(\*|/\*\*)?\s*@(param|return|exception|throw|throws)\s+')
# Same shape for the tags @author, @deprecated, @see, @since, @version, @serial.
_javadoc_uninteresting_tags = re.compile(r'^((\+|\-)( |\t))?\s*(\*|/\*\*)?\s*@(author|deprecated|see|since|version|serial)\s+')

# "+" followed by a space or tab.
_patch_plus_prefix = re.compile(r'^\+( |\t)')
# "-" followed by a space or tab.
_patch_minus_prefix = re.compile(r'^\-( |\t)')
# "+" or "-" at line start; the following space/tab is optional here.
_patch_plus_minus_prefix = re.compile(r'^(\+|\-)( |\t)?')
# Changed line that contains nothing but a single "*" (plus whitespace).
_patch_plus_minus_asterisk_prefix = re.compile(r'^(\+|\-)( |\t)*\*\s*$')
# Heuristic for a function header: optional "@word" annotations, a run of
# word characters / whitespace / brackets / generics punctuation, a
# parenthesised list, optional trailing words and commas, then "{" or ";".
_function_headers = re.compile(r'^\s*(@\w+)*\s*(\w|\s|\[|\]|<|>|\?|,|\.|(\/\*\w+\*\/))+\((\w|\s|,|\.|\[|\]|<|>|\?|(\/\*\w+\*\/))*\)(\w|\s|,)*(\{|\;)')
# One or more whitespace characters.
whitespaces = re.compile(r'(\s)+')
# Line holding only whitespace after an optional "+" or "-".
_empty_line = re.compile(r'^(\+|\-)?( |\t)*\s*$')

def only_whitespaces(deleted: str, added: str) -> bool:
    """Tell whether two texts differ in whitespace only.

    Both arguments are compared after every whitespace character has been
    removed from them.

    Args:
        deleted: Text taken from the removed side of a change.
        added: Text taken from the added side of a change.

    Returns:
        True when the whitespace-free forms are equal, False otherwise.
    """
    # Splitting on whitespace and re-joining drops every whitespace run.
    squeezed_deleted = "".join(deleted.split())
    squeezed_added = "".join(added.split())
    return squeezed_deleted == squeezed_added

# @numba.jit()
def has_java_javadoc_changed(file_name: str, patch: str, commit_date: datetime.datetime, sha: str, linecontext: int = 3) -> Tuple[bool, bool, bool, List[Modification]]:
    """Scan the patch of one file and report which kinds of lines it changes.

    The patch is walked line by line with a small state machine that tracks
    whether the current line lies inside a JavaDoc comment and inside its tag
    section (@param, @return, @exception, @throw, @throws). Changed lines are
    attributed to the tag section, to the rest of the JavaDoc, or to Java code.
    After a changed tag section ends, the following lines are searched for a
    function header and a ``Modification`` is recorded for it.

    NOTE(review): the nesting of this body was reconstructed from a copy whose
    indentation had been lost; the two places where more than one nesting was
    plausible are marked below -- confirm against the original file.

    Args:
        file_name: Stored in every ``Modification`` and passed to
            ``find_modification_before``.
        patch: Patch text of a single file.
        commit_date: Date from which the value returned by
            ``find_modification_before`` is subtracted.
        sha: Passed to ``find_modification_before``.
        linecontext: Not used by the current body.

    Returns:
        ``(has_java_changed, has_javadoc_changed, has_javadoc_tag_changed,
        modifications_in_file)``. The two JavaDoc flags are reset to False when
        the collected before/after text differs in whitespace only.
    """
    patchlines = patch.replace('\r', '').split('\n')

    has_javadoc_tag_changed = False
    has_javadoc_changed = False
    has_java_changed = False

    # Text of the JavaDoc body / tag section as it looks before and after the
    # patch; unchanged lines are appended to both sides.
    javadoc_lines_before = ''
    javadoc_lines_after = ''
    tag_lines_before = ''
    tag_lines_after = ''

    #interesting_line_indices: List[bool] = [False] * len(patchlines)

    modifications_in_file: List[Modification] = []
    javadoc_mod = ''
    functionheader_mod = ''

    going = False                   # inside a hunk (after an "@@" line)
    in_javadoc = False              # between "/**" and the closing "*/"
    in_javadoc_tag_section = False  # after the first interesting tag of a JavaDoc
    in_javadoc_end = False          # current line closes a JavaDoc (per-line flag)
    tag_line = False                # current line opens the tag section (per-line flag)
    lookfor_code = False            # collecting lines of a candidate function header
    lookfor_first_codeline = False  # waiting for the first non-empty line after the JavaDoc
    lookfor_endtag = False          # a tag line changed; waiting for the JavaDoc to close
    linecode_list = []              # lines collected while searching for the header
    linedoc_list = []               # changed tag-section lines of the current JavaDoc
    start_header = ''
    for l, ln in zip(patchlines, itertools.count()):
        in_javadoc_end = False
        tag_line = False
        # --- Function-header search, active after a changed tag section ended.
        if (lookfor_first_codeline and not _empty_line.match(l)) or lookfor_code:
            if lookfor_first_codeline:
                start_header = l.lstrip()
                lookfor_first_codeline = False
                lookfor_code = True
            # NOTE(review): assumed to run for every collected line, not only
            # the first one; otherwise the "> 9" limit below could never trigger.
            linecode_list.append(l)
            lines_ = "".join(linecode_list)
            match = _function_headers.search(lines_)
            if match:
                lookfor_code = False
                lookfor_first_codeline = False
                number_of_lines = len(linecode_list)
                functionheader_mod = '\n'.join(k for k in linecode_list)
                javadoc_mod = '\n'.join(k for k in linedoc_list)
                linecode_list = []
                linedoc_list = []
                # NOTE(review): the subtraction assumes find_modification_before
                # returns a value that can be subtracted from commit_date (never
                # None) -- verify against its definition.
                modification_before = find_modification_before(file_name, start_header, number_of_lines, sha, commit_date)
                offset = commit_date-modification_before
                modifications_in_file.append(Modification(file_name, javadoc_mod, functionheader_mod, modification_before, offset))
            elif len(linecode_list) > 9:
                # No header found within 10 collected lines: give up and record
                # the JavaDoc change without header information.
                lookfor_code = False
                lookfor_first_codeline = False
                javadoc_mod = '\n'.join(k for k in linedoc_list)
                linecode_list = []
                linedoc_list = []
                modifications_in_file.append(Modification(file_name, javadoc_mod, None, None, None))
        # --- Hunk / JavaDoc state tracking.
        if l.startswith('@@'):
            going = True
        elif l.startswith('--'):
            # NOTE(review): this also fires for a removed line whose content
            # itself starts with "-" (e.g. "--i;") -- confirm this is intended.
            going = False
        elif going and not in_javadoc and _javadoc_start_marker.match(l):
            in_javadoc = True
        if going and in_javadoc and not in_javadoc_tag_section and _javadoc_section_marker.match(l):
            # First interesting tag of this JavaDoc: start a fresh collection.
            tag_line = True
            in_javadoc_tag_section = True
            lookfor_code = False
            lookfor_endtag = False
            linecode_list = []
            linedoc_list = []
        elif going and in_javadoc_tag_section and _javadoc_uninteresting_tags.match(l):
            in_javadoc_tag_section = False
        if going and in_javadoc and _javadoc_end_marker.match(l):
            in_javadoc = False
            in_javadoc_tag_section = False
            in_javadoc_end = True
            if lookfor_endtag:
                lookfor_endtag = False
                lookfor_first_codeline = True
                # NOTE(review): assumed to belong to the lookfor_endtag branch.
                linecode_list = []
        # --- Attribution of the line to tag section / JavaDoc / Java code.
        if going and _patch_plus_minus_prefix.match(l):
            if _patch_plus_minus_asterisk_prefix.match(l):
                # A changed line holding only "*" is ignored.
                continue
            if in_javadoc_tag_section or in_javadoc_end:
                # "and" binds tighter than "or": a closing line counts only
                # when it also opened the tag section (tag_line).
                if in_javadoc_tag_section or in_javadoc_end and tag_line:
                    has_javadoc_tag_changed = True
                    # interesting_line_indices[ln] = True
                    linedoc_list.append(l)
                    #for zi in range(max(0, ln - linecontext), min(len(patchlines), ln + linecontext) + 1):
                    #    interesting_line_indices[zi] = True
                    if _patch_minus_prefix.match(l):
                        tag_lines_before = tag_lines_before + l[2:]
                    elif _patch_plus_prefix.match(l):
                        tag_lines_after = tag_lines_after + l[2:]
                    if in_javadoc_tag_section:
                        lookfor_endtag = True
                    elif tag_line:
                        # The tag section opened and closed on this very line,
                        # so the header search can start right away.
                        lookfor_first_codeline = True
                        linecode_list = []
            elif in_javadoc:
                has_javadoc_changed = True
                if _patch_minus_prefix.match(l):
                    javadoc_lines_before = javadoc_lines_before + l[2:]
                elif _patch_plus_prefix.match(l):
                    javadoc_lines_after = javadoc_lines_after + l[2:]
            else:
                # A changed line outside any JavaDoc: Java code changed, so any
                # pending header search is abandoned.
                has_java_changed = True
                lookfor_code = False
                lookfor_first_codeline = False
                linecode_list = []
                linedoc_list = []
        else:
            # Unchanged line: it belongs to both the before and after text.
            if in_javadoc_tag_section:
                tag_lines_before = tag_lines_before + l[2:]
                tag_lines_after = tag_lines_after + l[2:]
            elif in_javadoc:
                javadoc_lines_before = javadoc_lines_before + l[2:]
                javadoc_lines_after = javadoc_lines_after + l[2:]

    # Changes that only touch whitespace do not count as JavaDoc changes.
    if only_whitespaces(javadoc_lines_before, javadoc_lines_after):
        has_javadoc_changed = False
    if only_whitespaces(tag_lines_before, tag_lines_after):
        has_javadoc_tag_changed = False

    #if has_javadoc_tag_changed and not has_java_changed:
    #    brief = '\n'.join(
    #        l for l, n in zip(patchlines, interesting_line_indices) if n
    #    )
    #else:
    #    brief = ""

    return has_java_changed, has_javadoc_changed, has_javadoc_tag_changed, modifications_in_file
Loading