-
Notifications
You must be signed in to change notification settings - Fork 7
/
Home.py
173 lines (154 loc) · 7.92 KB
/
Home.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
import streamlit as st
import pandas as pd
import re
import numpy as np
import ParsingModule
import warnings
warnings.filterwarnings("ignore")
import os
import json
import gzip
local_dir = os.path.dirname(__file__)
from PIL import Image
import base64
import io
# Page-wide Streamlit settings: browser-tab title, wide layout, and the
# "Get help" / "Report a bug" menu entries, which both point at the same
# GitHub issues page.
issue_tracker_url = "https://github.com/compomics/lesSDRF/issues"
page_menu = {
    "Get help": issue_tracker_url,
    "Report a bug": issue_tracker_url,
}
st.set_page_config(page_title="lesSDRF", layout="wide", menu_items=page_menu)
def add_logo(logo_path, width, height):
    """Open the image at ``logo_path`` and return a resized copy.

    Despite its name, this function does not place anything on the page; it
    only loads the image and scales it.

    Args:
        logo_path: Path of the image file, passed to ``Image.open``.
        width: Target width, first element of the size passed to ``resize``.
        height: Target height, second element of the size passed to ``resize``.

    Returns:
        The image object produced by ``resize``.
    """
    target_size = (width, height)
    return Image.open(logo_path).resize(target_size)
def get_base64_image(image):
    """Serialise ``image`` as PNG and return the bytes as base64 text.

    Args:
        image: Object exposing ``save(file_obj, format=...)``; it is asked to
            write itself in PNG format into an in-memory buffer.

    Returns:
        str: Base64 encoding of everything ``image.save`` wrote.
    """
    with io.BytesIO() as png_buffer:
        image.save(png_buffer, format="PNG")
        png_bytes = png_buffer.getvalue()
    encoded = base64.b64encode(png_bytes)
    return encoded.decode()
# Load the logo from the directory of this script, like the other bundled
# resources in this file, instead of relying on the current working
# directory: a bare relative path breaks when the app is launched from
# another folder.
my_logo = add_logo(
    logo_path=os.path.join(local_dir, "final_logo.png"),
    width=149,
    height=58,
)

# Embed the resized logo as a base64 data URL and draw it as the background
# of the element tagged data-testid="stSidebarNav".
logo_base64 = get_base64_image(my_logo)
st.markdown(
    f"""
<style>
[data-testid="stSidebarNav"] {{
background-image: url('data:image/png;base64,{logo_base64}');
background-repeat: no-repeat;
background-position: 40px 20px;
}}
</style>
""",
    unsafe_allow_html=True,
)
# Read the bundled data files from the folders next to this script; the
# result is cached through Streamlit's st.cache_data decorator.
@st.cache_data
def load_data():
    """Load the gzipped JSON files and the Unimod table shipped with the app.

    Every ``*.json.gz`` file in the ``data`` folder next to this script is
    parsed, except files whose name matches one of the excluded terms
    (archaea, bacteria, eukaryota, virus, unclassified, other sequences).
    Files without the ``.json.gz`` suffix, files that are not valid gzip and
    files with invalid JSON are reported on the page and skipped.

    Returns:
        tuple: ``(data, unimod)``. ``data`` maps each file name with
        ``.json.gz`` removed to the parsed JSON content; ``unimod`` is the
        DataFrame read from the tab-separated ``ontology/unimod.csv``.
    """
    script_dir = os.path.dirname(__file__)
    folder_path = os.path.join(script_dir, "data")
    unimod_path = os.path.join(script_dir, "ontology", "unimod.csv")
    # File names matching any of these terms are deliberately not loaded.
    excluded_names = r"archaea|bacteria|eukaryota|virus|unclassified|other sequences"

    data = {}
    for filename in os.listdir(folder_path):
        if re.search(excluded_names, filename):
            continue
        file_path = os.path.join(folder_path, filename)
        if not filename.endswith(".json.gz"):
            st.write(f"Skipping file {file_path}: not a gzipped file")
            continue
        try:
            with gzip.open(file_path, "rb") as handle:
                parsed = json.loads(handle.read().decode('utf-8'))
        except json.JSONDecodeError:
            st.write(f"Error decoding JSON in file {file_path}")
        except gzip.BadGzipFile:
            st.write(f"Error reading file {file_path}: not a gzipped file")
        else:
            # Only successfully parsed files end up in the result.
            data[filename.replace(".json.gz", "")] = parsed

    unimod = pd.read_csv(unimod_path, sep="\t")
    return data, unimod
# Fetch the loaded data and seed the session state with it. Existing
# entries are left untouched, so values already stored in the session win.
data_dict, unimod = load_data()
for state_key, loaded_value in (("data_dict", data_dict), ("unimod", unimod)):
    if state_key not in st.session_state:
        st.session_state[state_key] = loaded_value
# Page heading and tagline.
st.title("Welcome to lesSDRF")
st.subheader("Spending less time on SDRF creates more time for amazing research")

# Background on SDRF and on the tool itself (Markdown with links).
about_text = """By providing metadata in a machine-readable format, other researchers can access your data more easily and you maximize its impact.
The Sample and Data Relationship Format ([SDRF](https://www.nature.com/articles/s41467-021-26111-3)) is the HUPO-PSI recognized metadata format within proteomics. lesSDRF will streamline this annotation process for you. This tool is developed by the [CompOmics group](https://compomics.com/) and published in [Nature Communications](https://www.nature.com/articles/s41467-023-42543-5). \n"""
st.write(about_text)

# Overview of the workflow: what to do on this page and in each sidebar step.
steps_text = """
On this homepage, select the species-specific default SDRF file that matches your study and provide the raw file names.
Then, follow the steps in the sidebar.
- Step 1: If you have a local metadata file, you can upload it to map to the SDRF file
- Step 2: Provide information on potential labels in your sample
- Step 3: Fill in the columns that are required for a valid SDRF
- Step 4: Fill in columns with additional information to further optimise your SDRF file
- Step 5: For atypical experiment types, you can check community suggested columns \n
"""
st.write(steps_text)

# Lead-in text for resuming work from a previously downloaded file.
resume_text = """You are able to download your intermediate file at any given time, so you can come back to the other steps whenever suits you.
Upload your intermediate SDRF file here:"""
st.markdown(resume_text)
# Optional re-entry point: the user can upload a previously saved SDRF file.
upload_df = st.file_uploader(
    "Upload intermediate SDRF file",
    type=["tsv"],
    accept_multiple_files=False,
    help='Upload a previously saved SDRF file. It should be in tsv format and should not contain more than 250 samples',
)
if upload_df is not None:
    template_df = pd.read_csv(upload_df, sep='\t')
    sample_count = template_df.shape[0]
    if sample_count <= 250:
        # Show the uploaded table and keep it in the session state.
        st.write(template_df)
        st.session_state["template_df"] = template_df
    else:
        st.error('Too many samples, please upload a maximum of 250 samples')

# Offer the example SDRF file stored next to this script as a download.
st.markdown("""In need of some inspiration? Download this example SDRF file to get an idea of the required output""")
example_path = f'{local_dir}/example_SDRF.tsv'
with open(example_path, 'rb') as example_file:
    st.download_button("Download example SDRF", example_file, file_name="example.sdrf.tsv")
def _split_file_names(raw_names):
    """Turn the text typed into the file-name box into a list of names.

    The separator is picked with the same priority the page has always used:
    a comma if the text contains one, otherwise a tab, otherwise a space.
    Text without any separator is treated as ONE file name (previously such
    a string was iterated character by character, producing one row per
    letter). Surrounding whitespace is stripped and empty entries, e.g. from
    a trailing comma or doubled spaces, are dropped.

    Args:
        raw_names: The raw string from the text input (may be empty or None).

    Returns:
        list[str]: The cleaned file names, possibly empty.
    """
    if not raw_names:
        return []
    for separator in (",", "\t", " "):
        if separator in raw_names:
            pieces = raw_names.split(separator)
            break
    else:
        # No separator at all: the whole input is a single file name.
        pieces = [raw_names]
    return [piece.strip() for piece in pieces if piece.strip()]


st.subheader("Start here with a completely new SDRF file")
species = ["","human", "cell-line", "default", "nonvertebrates", "plants", "vertebrates"]
species_label = """Select a species for the SDRF template which will contain the basic colummns to fill in for this specific species.
If your species is not in the drop down list, you can always use the default template."""
selected_species = st.selectbox(
    species_label,
    species,
    help="This species selection will impact the default columns present in your SDRF template. You can always add more columns in step *Additional columns*.",
)

# The empty first option means "nothing chosen yet"; the template workflow
# only runs once a real template has been selected.
if selected_species != "":
    folder_path = os.path.join(local_dir, "templates")
    # Load the template file that corresponds to the selected species.
    template_df = pd.read_csv(
        f"{folder_path}/sdrf-{selected_species}.tsv",
        sep="\t",
    )
    # Add three empty columns to the loaded template.
    template_df["comment[modification parameters]"] = np.nan
    template_df["comment[fragment mass tolerance]"] = np.nan
    template_df["comment[precursor mass tolerance]"] = np.nan

    # Ask the user for the raw file names of their samples.
    uploaded_names = st.text_input("Input raw file names as a comma or tab separated list", help="The raw file names will be input in the comment[data file] column and are the basis of your SDRF file. Input maximum 250 raw files")
    filenames = _split_file_names(uploaded_names)

    if len(filenames) > 250:
        st.error('Too many samples, please upload a maximum of 250 samples')
    else:
        st.write(f"Added filenames: {filenames}")
        # Store the file names in the dataframe and publish it to the session.
        template_df["comment[data file]"] = filenames
        st.session_state["template_df"] = template_df
        # Show the data in a table.
        st.write(template_df)

    # Fallback for the rejected (>250) case: make sure some template is
    # stored in the session state if none exists yet.
    if "template_df" not in st.session_state:
        st.session_state["template_df"] = template_df

    with st.sidebar:
        download = st.download_button("Press to download SDRF file",ParsingModule.convert_df(template_df), "intermediate_SDRF.sdrf.tsv", help="download your SDRF file")

# Citation guidance for manuscripts.
st.write("""Please refer to your data and lesSDRF within your manuscript as follows:
*The experimental metadata has been generated using lesSDRF and is available through ProteomeXchange with the dataset identifier [PXDxxxxxxx]*""")