Skip to content

Commit

Permalink
feat(openchallenges): pull data from OC Data google sheet (Sage-Bione…
Browse files Browse the repository at this point in the history
  • Loading branch information
vpchung authored Jan 15, 2025
1 parent b1484b7 commit c02e5df
Show file tree
Hide file tree
Showing 10 changed files with 747 additions and 153 deletions.
11 changes: 11 additions & 0 deletions apps/openchallenges/data-lambda/.env.example
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
TYPE="service_account"
PROJECT_ID="UPDATE_ME"
PRIVATE_KEY_ID="UPDATE_ME"
PRIVATE_KEY="UPDATE_ME"
CLIENT_EMAIL="UPDATE_ME"
CLIENT_ID="UPDATE_ME"
AUTH_URI="https://accounts.google.com/o/oauth2/auth"
TOKEN_URI="https://oauth2.googleapis.com/token"
AUTH_PROVIDER_X509_CERT_URL="https://www.googleapis.com/oauth2/v1/certs"
CLIENT_X509_CERT_URL="UPDATE_ME"
UNIVERSE_DOMAIN="googleapis.com"
2 changes: 1 addition & 1 deletion apps/openchallenges/data-lambda/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ RUN poetry export --without-hashes --format=requirements.txt > requirements.txt
FROM public.ecr.aws/lambda/python:3.13

COPY --from=builder /app/requirements.txt ${LAMBDA_TASK_ROOT}/
COPY sandbox_lambda_python/app.py ${LAMBDA_TASK_ROOT}/
COPY openchallenges_data_lambda/app.py ${LAMBDA_TASK_ROOT}/

RUN python3.13 -m pip install --no-cache-dir -r requirements.txt -t .

Expand Down
277 changes: 277 additions & 0 deletions apps/openchallenges/data-lambda/openchallenges_data_lambda/app.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,277 @@
import os
import json

import gspread
import numpy as np
import pandas as pd


GOOGLE_SHEET_CREDENTIALS_FILE = "service_account.json"
GOOGLE_SHEET_TITLE = "OpenChallenges Data"


def lambda_handler(event, context):
"""Sample pure Lambda function
Parameters
----------
event: dict, required
API Gateway Lambda Proxy Input Format
Event doc: https://docs.aws.amazon.com/apigateway/latest/developerguide/set-up-lambda-proxy-integrations.html#api-gateway-simple-proxy-for-lambda-input-format
context: object, required
Lambda Context runtime methods and attributes
Context doc: https://docs.aws.amazon.com/lambda/latest/dg/python-context-object.html
Returns
------
API Gateway Lambda Proxy Output Format: dict
Return doc: https://docs.aws.amazon.com/apigateway/latest/developerguide/set-up-lambda-proxy-integrations.html
"""

write_credentials_file(GOOGLE_SHEET_CREDENTIALS_FILE)

try:
google_client = gspread.service_account(filename=GOOGLE_SHEET_CREDENTIALS_FILE)
except Exception as err:
message = "Private key not found in the credentials file. Please try again."
else:
try:
wks = google_client.open(GOOGLE_SHEET_TITLE)

platforms = get_platform_data(wks)
print(platforms.head())

roles = get_roles(wks)
print(roles.head())

categories = get_challenge_categories(wks)
print(categories.head())

organizations = get_organization_data(wks)
print(organizations.head())

edam_data_annotations = get_edam_annotations(wks)
print(edam_data_annotations.head())

challenges, incentives, sub_types = get_challenge_data(wks)
print(challenges.head())
print(incentives.head())
print(sub_types.head())

message = "Data successfully pulled from OC Data google sheet."

except Exception as err:
message = f"Something went wrong with pulling the data: {err}."

data = {"message": message}
return {
"statusCode": 200,
"body": json.dumps(data),
}


def write_credentials_file(output_json):
"""Write credentials JSON file for Google Sheets API authentication."""
with open(output_json, "w") as out:
credentials = {
"type": os.getenv("TYPE"),
"project_id": os.getenv("PROJECT_ID"),
"private_key_id": os.getenv("PRIVATE_KEY_ID"),
"private_key": os.getenv("PRIVATE_KEY").encode().decode("unicode_escape"),
"client_email": os.getenv("CLIENT_EMAIL"),
"client_id": os.getenv("CLIENT_ID"),
"auth_uri": os.getenv("AUTH_URI"),
"token_uri": os.getenv("TOKEN_URI"),
"auth_provider_x509_cert_url": os.getenv("AUTH_PROVIDER_X509_CERT_URL"),
"client_x509_cert_url": os.getenv("CLIENT_X509_CERT_URL"),
"universe_domain": os.getenv("UNIVERSE_DOMAIN"),
}
out.write(json.dumps(credentials))


def get_challenge_data(wks, sheet_name="challenges"):
"""Get challenges data and clean up as needed.
Output:
- challenges
- challenge incentives
- challenge submission types
"""
df = pd.DataFrame(wks.worksheet(sheet_name).get_all_records()).fillna("")
df.loc[df._platform == "Other", "platform"] = "\\N"

challenges = df[
[
"id",
"slug",
"name",
"headline",
"description",
"avatar_url",
"website_url",
"status",
"platform",
"doi",
"start_date",
"end_date",
"operation_id",
"created_at",
"updated_at",
]
]
challenges = (
challenges.replace({r"\s+$": "", r"^\s+": ""}, regex=True)
.replace(r"\n", " ", regex=True)
.replace("'", "''")
.replace("\u2019", "''", regex=True) # replace curly right-quote
.replace("\u202f", " ", regex=True) # replace narrow no-break space
.replace("\u2060", "", regex=True) # remove word joiner
)
challenges["headline"] = (
challenges["headline"]
.astype(str)
.apply(lambda x: x[:76] + "..." if len(x) > 80 else x)
)
challenges["description"] = (
challenges["description"]
.astype(str)
.apply(lambda x: x[:995] + "..." if len(x) > 1000 else x)
)
challenges.loc[challenges.start_date == "", "start_date"] = "\\N"
challenges.loc[challenges.end_date == "", "end_date"] = "\\N"
challenges.loc[challenges.operation_id == "", "operation_id"] = "\\N"

incentives = pd.concat(
[
df[df.monetary_incentive == "TRUE"][["id", "created_at"]].assign(
incentives="monetary"
),
df[df.publication_incentive == "TRUE"][["id", "created_at"]].assign(
incentives="publication"
),
df[df.speaking_incentive == "TRUE"][["id", "created_at"]].assign(
incentives="speaking_engagement"
),
df[df.other_incentive == "TRUE"][["id", "created_at"]].assign(
incentives="other"
),
]
).rename(columns={"id": "challenge_id"})
incentives["incentives"] = pd.Categorical(
incentives["incentives"],
categories=["monetary", "publication", "speaking_engagement", "other"],
)
incentives = incentives.sort_values(["challenge_id", "incentives"])
incentives.index = np.arange(1, len(incentives) + 1)

sub_types = pd.concat(
[
df[df.file_submission == "TRUE"][["id", "created_at"]].assign(
submission_types="prediction_file"
),
df[df.container_submission == "TRUE"][["id", "created_at"]].assign(
submission_types="container_image"
),
df[df.notebook_submission == "TRUE"][["id", "created_at"]].assign(
submission_types="notebook"
),
df[df.mlcube_submission == "TRUE"][["id", "created_at"]].assign(
submission_types="mlcube"
),
df[df.other_submission == "TRUE"][["id", "created_at"]].assign(
submission_types="other"
),
]
).rename(columns={"id": "challenge_id"})
sub_types["submission_types"] = pd.Categorical(
sub_types["submission_types"],
categories=[
"prediction_file",
"container_image",
"notebook",
"mlcube",
"other",
],
)
sub_types = sub_types.sort_values(["challenge_id", "submission_types"])
sub_types.index = np.arange(1, len(sub_types) + 1)

return (
challenges,
incentives[["incentives", "challenge_id", "created_at"]],
sub_types[["submission_types", "challenge_id", "created_at"]],
)


def get_challenge_categories(wks, sheet_name="challenge_category"):
"""Get challenge categories."""
return pd.DataFrame(wks.worksheet(sheet_name).get_all_records()).fillna("")[
["id", "challenge_id", "category"]
]


def get_platform_data(wks, sheet_name="platforms"):
"""Get platform data and clean up as needed."""
platforms = pd.DataFrame(wks.worksheet(sheet_name).get_all_records()).fillna("")
return platforms[platforms._public == "TRUE"][
["id", "slug", "name", "avatar_url", "website_url", "created_at", "updated_at"]
]


def get_organization_data(wks, sheet_name="organizations"):
"""Get organization data and clean up as needed."""
organizations = pd.DataFrame(wks.worksheet(sheet_name).get_all_records()).fillna("")
organizations = organizations[organizations._public == "TRUE"][
[
"id",
"name",
"login",
"avatar_url",
"website_url",
"description",
"challenge_count",
"created_at",
"updated_at",
"acronym",
]
]
organizations = (
organizations.replace({r"\s+$": "", r"^\s+": ""}, regex=True)
.replace(r"\n", " ", regex=True)
.replace("'", "''")
.replace("\u2019", "''", regex=True) # replace curly right-quote
.replace("\u202f", " ", regex=True) # replace narrow no-break space
.replace("\u2060", "", regex=True) # remove word joiner
)
organizations["description"] = (
organizations["description"]
.astype(str)
.apply(lambda x: x[:995] + "..." if len(x) > 1000 else x)
)
return organizations


def get_roles(wks, sheet_name="contribution_role"):
"""Get data on organization's role(s) in challenges."""
return (
pd.DataFrame(wks.worksheet(sheet_name).get_all_records())
.fillna("")
.drop(["_challenge", "_organization"], axis=1)
)


def get_edam_annotations(wks, sheet_name="challenge_data"):
"""Get data on challenge's EDAM annotations."""
return (
pd.DataFrame(wks.worksheet(sheet_name).get_all_records())
.fillna("")
.drop(["_challenge", "_edam_name"], axis=1)
)


if __name__ == "__main__":
lambda_handler({}, "")
Loading

0 comments on commit c02e5df

Please sign in to comment.