-
Notifications
You must be signed in to change notification settings - Fork 6
/
releases.py
113 lines (99 loc) · 3.5 KB
/
releases.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
import argparse
import logging
import os.path
import pandas as pd
def create_release_stats(studies_file, release=None, date=None, size=None):
    """Build a one-row DataFrame summarising a data release.

    The studies table is read from ``studies_file`` as tab-separated UTF-8.
    Every value of its ``Introduced`` column is converted to an integer by
    dropping its first four characters; rows whose number is less than or
    equal to the number of the requested release contribute to the totals.

    Args:
        studies_file: Location of the studies TSV, passed to
            ``pandas.read_csv``.
        release: Release name of the form ``prod<N>``. When falsy, the
            largest number found in ``Introduced`` is used instead.
        date: Value stored in the "Date" column; "TBD" when falsy.
        size: Value stored in the "DB Size (GB)" column; "TBD" when falsy.

    Returns:
        A ``pandas.DataFrame`` holding a single row at index 0.
    """
    table = pd.read_csv(studies_file, delimiter='\t', encoding='utf-8')

    column_names = (
        "Date",
        "Data release",
        "Code version",
        "Sets",
        "Wells",
        "Experiments",
        "Images",
        "Planes",
        "Size (TB)",
        "Files (Million)",
        "DB Size (GB)",
        "Studies",
    )
    stats = pd.DataFrame(columns=column_names)

    # Numeric part of each study's "Introduced" release name.
    introduced = table['Introduced'].apply(lambda name: int(name[4:]))

    if release:
        release_suffix = int(release[len('prod'):])
    else:
        # No release requested: fall back to the most recent one on record.
        release_suffix = max(introduced)
        release = "prod%s" % release_suffix

    # Only studies introduced in this release or an earlier one are counted.
    included = table[introduced <= release_suffix]

    stats.loc[0] = (
        date if date else "TBD",
        release,
        get_release_code(release_suffix),
        int(included['Sets'].sum()),
        int(included['Wells'].sum()),
        "",  # "Experiments" is left blank
        int(included['5D Images'].sum()),
        int(included['Planes'].sum()),
        included['Size (TB)'].sum(),
        included['# of Files'].sum() / 10 ** 6,  # file count in millions
        size if size else "TBD",
        included['Study'].nunique(),
    )
    return stats
def print_release_stats(df, fmt, target=None):
    """Output the release statistics in the requested format.

    Args:
        df: The ``pandas.DataFrame`` to output.
        fmt: Output format. 'tsv' produces tab-separated rows without a
            header or index; any other value is dispatched to the DataFrame
            method named ``to_<fmt>`` (for example 'csv', 'string', 'json').
        target: Only consulted when ``fmt`` is 'tsv'. When truthy, the rows
            are appended to this file and nothing is printed.
    """
    if fmt == 'tsv':
        if target:
            # Append to the existing file instead of writing to stdout.
            df.to_csv(target, sep='\t', mode='a', header=False, index=False)
            return
        rendered = df.to_csv(sep='\t', header=False, index=False)
    else:
        writer = getattr(df, f'to_{fmt}')
        # to_json is called without arguments; the other writers are asked
        # to omit the index.
        rendered = writer() if fmt == 'json' else writer(index=False)
    print(rendered)
def get_release_code(release_suffix):
    """Derive the "0.<minor>.<patch>" code version from a release number.

    The last decimal digit of the number becomes the patch component and
    the remaining leading digits become the minor component, e.g.
    61 -> "0.6.1" and 102 -> "0.10.2".

    Args:
        release_suffix: Numeric part of a release name; any value accepted
            by ``int()``.

    Returns:
        The version string.

    Raises:
        ValueError: If ``release_suffix`` cannot be converted by ``int()``.
    """
    # Stay in integer arithmetic: float division followed by int() silently
    # loses precision once the value exceeds 2**53.
    minor, patch = divmod(int(release_suffix), 10)
    return "0.%s.%s" % (minor, patch)
def main():
    """Command-line entry point: compute release statistics and output them.

    Parses the command line, configures logging verbosity from the number
    of ``-v`` flags, builds the statistics with ``create_release_stats`` and
    hands them to ``print_release_stats``. If a file named ``releases.tsv``
    exists next to the studies file it is passed as the output target.
    """
    # The text is the program description; passing it positionally would
    # set ``prog`` and corrupt the usage line.
    parser = argparse.ArgumentParser(
        description="Generate release statistics")
    parser.add_argument(
        "--release", default=None, help="Name of the release")
    parser.add_argument(
        "--release-date", default=None, help="Date of the release")
    parser.add_argument(
        "--db-size", default=None,
        help="Size of the database for the release in GB")
    parser.add_argument("--format", default="tsv", help=(
        "Output format, includes 'string', 'csv', 'tsv' (default), and "
        "'json'. "
        "'string' is the most human-readable (fixed width columns). "
        "If tsv is selected and a file called releases.tsv exists in the "
        "same directory as the file specified by studies_file, the output "
        "will be appended to this file."
    ))
    parser.add_argument('-v', '--verbose', action='count', default=0)
    parser.add_argument(
        "studies_file", help="Path to TSV file containing study stats")
    ns = parser.parse_args()

    # Each -v raises the verbosity one step, capped at DEBUG.
    levels = [logging.WARNING, logging.INFO, logging.DEBUG]
    level = levels[min(len(levels) - 1, ns.verbose)]
    logging.basicConfig(
        level=level, format="%(asctime)s %(levelname)s %(message)s")

    df = create_release_stats(
        ns.studies_file, release=ns.release, date=ns.release_date,
        size=ns.db_size)

    releases_file = os.path.join(
        os.path.dirname(ns.studies_file), 'releases.tsv')
    # Only offer the file as a target when it already exists; otherwise
    # the statistics are printed.
    target = releases_file if os.path.exists(releases_file) else None
    print_release_stats(df, ns.format, target=target)
# Run the command-line entry point only when executed as a script.
if __name__ == "__main__":
    main()