Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ENH] Implemented pipeline_version and pipeline_name query fields #345

Merged
merged 17 commits into from
Oct 9, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
64 changes: 64 additions & 0 deletions app/api/crud.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@
"dataset_uuid",
"dataset_name",
"dataset_portal_uri",
"pipeline_version",
"pipeline_name",
]


Expand Down Expand Up @@ -100,6 +102,8 @@ async def get(
min_num_phenotypic_sessions: int,
assessment: str,
image_modal: str,
pipeline_name: str,
pipeline_version: str,
) -> list[CohortQueryResponse]:
"""
Sends SPARQL queries to the graph API via httpx POST requests for subject-session or dataset metadata
Expand All @@ -125,6 +129,10 @@ async def get(
Non-imaging assessment completed by subjects.
image_modal : str
Imaging modality of subject scans.
pipeline_name : str
Name of pipeline run on subject scans.
pipeline_version : str
Version of pipeline run on subject scans.

Returns
-------
Expand All @@ -142,6 +150,8 @@ async def get(
min_num_imaging_sessions=min_num_imaging_sessions,
assessment=assessment,
image_modal=image_modal,
pipeline_version=pipeline_version,
pipeline_name=pipeline_name,
)
)

Expand All @@ -158,6 +168,7 @@ async def get(

response_obj = []
dataset_cols = ["dataset_uuid", "dataset_name"]
dataset_available_pipeline_info = {}
if not results_df.empty:
for (dataset_uuid, dataset_name), group in results_df.groupby(
by=dataset_cols
Expand Down Expand Up @@ -189,6 +200,50 @@ async def get(
)
)

pipeline_grouped_data = (
group.groupby(
[
"sub_id",
"session_id",
"session_type",
"pipeline_name",
],
dropna=True,
)
.agg(
{
"pipeline_version": lambda x: list(
rmanaem marked this conversation as resolved.
Show resolved Hide resolved
x.dropna().unique()
)
}
)
.reset_index()
)

session_completed_pipeline_data = (
pipeline_grouped_data.groupby(
["sub_id", "session_id", "session_type"]
)
.apply(
lambda x: dict(
zip(x["pipeline_name"], x["pipeline_version"])
)
)
.reset_index(name="completed_pipelines")
)

subject_data = pd.merge(
subject_data.reset_index(drop=True),
session_completed_pipeline_data,
on=["sub_id", "session_id", "session_type"],
how="left",
)

# ensure that for sessions missing completed pipeline info, completed_pipelines is still a dict rather than null/nan
subject_data["completed_pipelines"] = subject_data[
rmanaem marked this conversation as resolved.
Show resolved Hide resolved
"completed_pipelines"
].apply(lambda x: x if isinstance(x, dict) else {})

# TODO: Revisit this as there may be a more elegant solution.
# The following code replaces columns with all NaN values with values of None, to ensure they show up in the final JSON as `null`.
# This is needed as the above .agg() seems to turn NaN into None for object-type columns (which have some non-missing values)
Expand All @@ -204,6 +259,14 @@ async def get(

subject_data = list(subject_data.to_dict("records"))

dataset_available_pipeline_info = (
group.groupby("pipeline_name", dropna=True)[
"pipeline_version"
]
.apply(lambda x: list(x.dropna().unique()))
.to_dict()
)

response_obj.append(
CohortQueryResponse(
dataset_uuid=dataset_uuid,
Expand All @@ -224,6 +287,7 @@ async def get(
group["image_modal"].notna()
].unique()
),
available_pipelines=dataset_available_pipeline_info,
)
)

Expand Down
6 changes: 6 additions & 0 deletions app/api/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from pydantic import BaseModel, constr, root_validator

CONTROLLED_TERM_REGEX = r"^[a-zA-Z]+[:]\S+$"
VERSION_REGEX = r"^([A-Za-z0-9-]+)\.(\d+)\.([A-Za-z0-9-]+)$"


class QueryModel(BaseModel):
Expand All @@ -22,6 +23,9 @@ class QueryModel(BaseModel):
min_num_phenotypic_sessions: int = Query(default=None, ge=0)
assessment: constr(regex=CONTROLLED_TERM_REGEX) = None
image_modal: constr(regex=CONTROLLED_TERM_REGEX) = None
pipeline_name: constr(regex=CONTROLLED_TERM_REGEX) = None
# TODO: Check back if validating using a regex is too restrictive
pipeline_version: constr(regex=VERSION_REGEX) = None

@root_validator()
def check_maxage_ge_minage(cls, values):
Expand Down Expand Up @@ -67,6 +71,7 @@ class SessionResponse(BaseModel):
assessment: list
image_modal: list
session_file_path: Optional[str]
completed_pipelines: dict


class CohortQueryResponse(BaseModel):
Expand All @@ -81,6 +86,7 @@ class CohortQueryResponse(BaseModel):
num_matching_subjects: int
subject_data: Union[list[SessionResponse], str]
image_modals: list
available_pipelines: dict


class DataElementURI(str, Enum):
Expand Down
2 changes: 2 additions & 0 deletions app/api/routers/query.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,8 @@ async def get_query(
query.min_num_phenotypic_sessions,
query.assessment,
query.image_modal,
query.pipeline_name,
query.pipeline_version,
)

return response
48 changes: 44 additions & 4 deletions app/api/utility.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@
"ncit": "http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#",
"nidm": "http://purl.org/nidash/nidm#",
"snomed": "http://purl.bioontology.org/ontology/SNOMEDCT/",
"np": "https://github.com/nipoppy/pipeline-catalog/tree/main/processing/",
}

# Store domains in named tuples
Expand All @@ -61,6 +62,8 @@
IS_CONTROL = Domain("subject_group", "nb:isSubjectGroup")
ASSESSMENT = Domain("assessment", "nb:hasAssessment")
IMAGE_MODAL = Domain("image_modal", "nb:hasContrastType")
PIPELINE_NAME = Domain("pipeline_name", "nb:hasPipelineName")
PIPELINE_VERSION = Domain("pipeline_version", "nb:hasPipelineVersion")
PROJECT = Domain("project", "nb:hasSamples")


Expand Down Expand Up @@ -115,6 +118,8 @@ def create_query(
min_num_phenotypic_sessions: Optional[int] = None,
assessment: Optional[str] = None,
image_modal: Optional[str] = None,
pipeline_name: Optional[str] = None,
pipeline_version: Optional[str] = None,
) -> str:
"""
Creates a SPARQL query using a query template and filters it using the input parameters.
Expand All @@ -139,6 +144,10 @@ def create_query(
Non-imaging assessment completed by subjects, by default None.
image_modal : str, optional
Imaging modality of subject scans, by default None.
pipeline_name : str, optional
Name of pipeline run on subject scans, by default None.
pipeline_version : str, optional
Version of pipeline run on subject scans, by default None.

Returns
-------
Expand Down Expand Up @@ -203,13 +212,28 @@ def create_query(
imaging_session_level_filters = ""
if image_modal is not None:
imaging_session_level_filters += (
"\n" + f"FILTER (?{IMAGE_MODAL.var} = {image_modal})."
"\n"
+ f"{create_bound_filter(IMAGE_MODAL.var)} && ?{IMAGE_MODAL.var} = {image_modal})."
)

if pipeline_name is not None:
imaging_session_level_filters += (
"\n"
+ f"{create_bound_filter(PIPELINE_NAME.var)} && (?{PIPELINE_NAME.var} = {pipeline_name})."
)

# In case a user specified the pipeline version but not the name
if pipeline_version is not None:
imaging_session_level_filters += (
"\n"
+ f'{create_bound_filter(PIPELINE_VERSION.var)} && ?{PIPELINE_VERSION.var} = "{pipeline_version}").' # Wrap with quotes to avoid implicit type conversion
)

query_string = textwrap.dedent(
f"""
SELECT DISTINCT ?dataset_uuid ?dataset_name ?dataset_portal_uri ?sub_id ?age ?sex
?diagnosis ?subject_group ?num_matching_phenotypic_sessions ?num_matching_imaging_sessions ?session_id ?session_type ?assessment ?image_modal ?session_file_path
?diagnosis ?subject_group ?num_matching_phenotypic_sessions ?num_matching_imaging_sessions
?session_id ?session_type ?assessment ?image_modal ?session_file_path ?pipeline_version ?pipeline_name
WHERE {{
?dataset_uuid a nb:Dataset;
nb:hasLabel ?dataset_name;
Expand Down Expand Up @@ -244,14 +268,30 @@ def create_query(
{phenotypic_session_level_filters}
}} GROUP BY ?subject
}}

OPTIONAL {{
?session nb:hasCompletedPipeline ?pipeline.
?pipeline nb:hasPipelineVersion ?pipeline_version.
?pipeline nb:hasPipelineName ?pipeline_name.
}}
{{
SELECT ?subject (count(distinct ?imaging_session) as ?num_matching_imaging_sessions)
WHERE {{
?subject a nb:Subject.
OPTIONAL {{
?subject nb:hasSession ?imaging_session.
?imaging_session a nb:ImagingSession;
nb:hasAcquisition/nb:hasContrastType ?image_modal.
?imaging_session a nb:ImagingSession.

OPTIONAL {{
?imaging_session nb:hasAcquisition ?acquisition.
?acquisition nb:hasContrastType ?image_modal.
}}

OPTIONAL {{
?imaging_session nb:hasCompletedPipeline ?pipeline.
?pipeline nb:hasPipelineVersion ?pipeline_version;
nb:hasPipelineName ?pipeline_name.
}}
}}
{imaging_session_level_filters}
}} GROUP BY ?subject
Expand Down
12 changes: 11 additions & 1 deletion docs/default_neurobagel_query.rq
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ PREFIX nidm: <http://purl.org/nidash/nidm#>
PREFIX snomed: <http://purl.bioontology.org/ontology/SNOMEDCT/>

SELECT DISTINCT ?dataset_uuid ?dataset_name ?dataset_portal_uri ?sub_id ?age ?sex
?diagnosis ?subject_group ?num_matching_phenotypic_sessions ?num_matching_imaging_sessions ?session_id ?session_type ?assessment ?image_modal ?session_file_path
?diagnosis ?subject_group ?num_matching_phenotypic_sessions ?num_matching_imaging_sessions ?session_id ?session_type ?assessment ?image_modal ?session_file_path ?pipeline_name ?pipeline_version
WHERE {
?dataset_uuid a nb:Dataset;
nb:hasLabel ?dataset_name;
Expand Down Expand Up @@ -41,6 +41,11 @@ WHERE {

} GROUP BY ?subject
}
OPTIONAL {
?session nb:hasCompletedPipeline ?pipeline.
?pipeline nb:hasPipelineVersion ?pipeline_version.
?pipeline nb:hasPipelineName ?pipeline_name.
}
{
SELECT ?subject (count(distinct ?imaging_session) as ?num_matching_imaging_sessions)
WHERE {
Expand All @@ -50,6 +55,11 @@ WHERE {
?imaging_session a nb:ImagingSession;
nb:hasAcquisition/nb:hasContrastType ?image_modal.
}
OPTIONAL {
?imaging_session nb:hasCompletedPipeline ?pipeline.
?pipeline nb:hasPipelineVersion ?pipeline_version.
?pipeline nb:hasPipelineName ?pipeline_name.
}

} GROUP BY ?subject
}
Expand Down
13 changes: 13 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,9 @@ def test_data():
"http://purl.org/nidash/nidm#T1Weighted",
"http://purl.org/nidash/nidm#T2Weighted",
],
"available_pipelines": {
"freesurfer": ["7.3.2", "2.8.2", "8.7.0-rc"]
},
},
{
"dataset_uuid": "http://neurobagel.org/vocab/67890",
Expand All @@ -86,6 +89,10 @@ def test_data():
"http://purl.org/nidash/nidm#FlowWeighted",
"http://purl.org/nidash/nidm#T1Weighted",
],
"available_pipelines": {
"freesurfer": ["7.3.2", "2.1.2"],
"fmriprep": ["23.1.3", "22.1.4", "v2.0.1"],
},
},
]

Expand Down Expand Up @@ -178,6 +185,8 @@ async def _mock_get_with_exception(
min_num_phenotypic_sessions,
assessment,
image_modal,
pipeline_version,
pipeline_name,
):
raise request.param

Expand Down Expand Up @@ -206,6 +215,8 @@ async def _mock_get(
min_num_phenotypic_sessions,
assessment,
image_modal,
pipeline_version,
pipeline_name,
):
return request.param

Expand All @@ -226,6 +237,8 @@ async def _mock_successful_get(
min_num_phenotypic_sessions,
assessment,
image_modal,
pipeline_version,
pipeline_name,
):
return test_data

Expand Down
Loading