Skip to content

Commit

Permalink
Add data tool for colon data. (#100)
Browse files Browse the repository at this point in the history
* base functions

* add full function

* add doc

* add api doc
  • Loading branch information
PhilipMay authored Dec 8, 2023
1 parent 7f9cd4c commit f1c3962
Show file tree
Hide file tree
Showing 5 changed files with 158 additions and 0 deletions.
3 changes: 3 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -20,5 +20,8 @@ test:
# Rebuild the HTML documentation: runs the `clean` and `html` targets of the
# Makefile in docs/ inside the poetry environment.
sphinx:
poetry run make -C docs clean html

# Open the index page of the HTML documentation under docs/build/html.
# NOTE(review): relies on an `open` command being available on the platform - confirm.
open-sphinx:
open docs/build/html/index.html

# Refresh the poetry lock file, then install the project with all extras.
install:
poetry lock && poetry install --all-extras
6 changes: 6 additions & 0 deletions docs/source/api-reference/data.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
.. _data_code_doc:

:mod:`~mltb2.data`
==================

.. automodule:: mltb2.data
111 changes: 111 additions & 0 deletions mltb2/data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
# Copyright (c) 2020 - 2023 Philip May
# Copyright (c) 2021 Sigrun May, Helmholtz-Zentrum für Infektionsforschung GmbH (HZI)
# Copyright (c) 2021 Sigrun May, Ostfalia Hochschule für angewandte Wissenschaften
# This software is distributed under the terms of the MIT license
# which is available at https://opensource.org/licenses/MIT

"""Data loading functionality."""

import os
from hashlib import sha256
from typing import Tuple

import joblib
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

from mltb2.files import get_and_create_mltb2_data_dir


def _load_colon_data() -> pd.DataFrame:
    """Load colon data (not the labels).

    The data is loaded and parsed from the internet.
    Also see `colon tissues probed by oligonucleotide arrays
    <http://genomics-pubs.princeton.edu/oncology/affydata/index.html>`_.

    Returns:
        data as pandas DataFrame. The page holds 2000 lines with 62 values
        each; the result is transposed to shape ``(62, 2000)``.

    Raises:
        AssertionError: If the downloaded page does not match the expected
            SHA-256 checksum, or does not parse into 2000 lines of 62 values.
    """
    # download data file
    url = "http://genomics-pubs.princeton.edu/oncology/affydata/I2000.html"
    page = requests.get(url, timeout=10)

    # Check checksum of data file.
    # The validations in this function raise explicitly instead of using
    # ``assert`` so that they are still executed under ``python -O``.
    # AssertionError is kept so the failure type stays unchanged for callers.
    expected_hash = "74cc7b47d40a0fbca8dde05f42bcb799b7babad29ea634139a221bb4386b1c3d"
    page_hash = sha256(page.content).hexdigest()
    if page_hash != expected_hash:
        raise AssertionError(f"Unexpected checksum for '{url}': {page_hash}")

    soup = BeautifulSoup(page.content, "html.parser")
    page_text_lines = soup.get_text().splitlines()
    if len(page_text_lines) < 2000:
        raise AssertionError(f"Expected at least 2000 text lines, got {len(page_text_lines)}")

    # Only lines longer than 20 characters are treated as data rows;
    # each of them is split on whitespace and parsed as floats.
    rows = [[float(s) for s in line.split()] for line in page_text_lines if len(line) > 20]
    if len(rows) != 2000:
        raise AssertionError(f"Expected 2000 data lines, got {len(rows)}")
    if len(rows[0]) != 62:
        raise AssertionError(f"Expected 62 values per data line, got {len(rows[0])}")

    # transpose so that the 62 values of each line become the rows
    data = np.array(rows).T
    return pd.DataFrame(data)


def _load_colon_label() -> pd.Series:
    """Load colon label (not the data).

    The data is loaded and parsed from the internet.
    Also see `colon tissues probed by oligonucleotide arrays
    <http://genomics-pubs.princeton.edu/oncology/affydata/index.html>`_.

    Returns:
        labels as pandas Series. Every line of the page that is a plain
        integer yields one label: ``0`` if the integer is positive,
        ``1`` otherwise.

    Raises:
        AssertionError: If the downloaded page does not match the expected
            SHA-256 checksum, or does not contain exactly 62 labels.
    """
    # download data file
    url = "http://genomics-pubs.princeton.edu/oncology/affydata/tissues.html"
    page = requests.get(url, timeout=10)

    # Check checksum of data file.
    # The validations in this function raise explicitly instead of using
    # ``assert`` so that they are still executed under ``python -O``.
    # AssertionError is kept so the failure type stays unchanged for callers.
    expected_hash = "0c5b377c5dd5544d015bff479a4260d5ccf0bcf98657f600a1d37e34193e0f52"
    page_hash = sha256(page.content).hexdigest()
    if page_hash != expected_hash:
        raise AssertionError(f"Unexpected checksum for '{url}': {page_hash}")

    soup = BeautifulSoup(page.content, "html.parser")
    page_text_lines = soup.get_text().splitlines()

    label = []
    for line in page_text_lines:
        try:
            i = int(line)
        except ValueError:
            continue  # lines that are not a plain integer are ignored on purpose
        label.append(0 if i > 0 else 1)

    if len(label) != 62:
        raise AssertionError(f"Expected 62 labels, got {len(label)}")
    return pd.Series(label)


def load_colon() -> Tuple[pd.Series, pd.DataFrame]:
    """Load colon data.

    The data is loaded and parsed from the internet.
    Also see `colon tissues probed by oligonucleotide arrays
    <http://genomics-pubs.princeton.edu/oncology/affydata/index.html>`_.

    On the first call the result is cached as ``colon.pkl.gz`` in the mltb2
    data directory; later calls load it from that file instead of
    downloading it again.

    Returns:
        Tuple containing labels and data.
    """
    filename = "colon.pkl.gz"
    mltb2_data_home = get_and_create_mltb2_data_dir()
    full_path = os.path.join(mltb2_data_home, filename)

    if os.path.exists(full_path):
        # NOTE: joblib files are pickle based - only load the cache file
        # written by this function from the local data directory.
        return joblib.load(full_path)

    data_df = _load_colon_data()
    label_series = _load_colon_label()
    result = (label_series, data_df)

    # Write to a temporary file in the same directory and move it into place
    # afterwards. An interrupted dump then never leaves a truncated cache file
    # behind that every later call would try (and fail) to load.
    tmp_path = f"{full_path}.{os.getpid()}.tmp"
    try:
        joblib.dump(result, tmp_path, compress=("gzip", 3))
        os.replace(tmp_path, full_path)
    finally:
        # only still present if the dump or the move failed
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

    return result
6 changes: 6 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -68,10 +68,14 @@ tiktoken = {version = "*", optional = true}
safetensors = {version = "!=0.3.2", optional = true} # version 0.3.2 has poetry issues
openai = {version = "^0", optional = true}
pyyaml = {version = "*", optional = true}
pandas = {version = "*", optional = true}
beautifulsoup4 = {version = "*", optional = true}
joblib = {version = "*", optional = true}

[tool.poetry.extras]
files = ["platformdirs", "scikit-learn"]
fasttext = ["fasttext-wheel"]
data = ["platformdirs", "scikit-learn", "pandas", "beautifulsoup4", "joblib"]
optuna = ["optuna"]
plot = ["matplotlib"]
somajo = ["SoMaJo"]
Expand Down Expand Up @@ -131,6 +135,8 @@ ignore = [
"PLR0913", # Too many arguments to function call ({c_args} > {max_args})
"S106", # Possible hardcoded password assigned to argument: "{}"
"COM812", # Trailing comma missing
"S101", # Use of `assert` detected
"PLR2004", # Magic value used in comparison,
]

[tool.ruff.per-file-ignores]
Expand Down
32 changes: 32 additions & 0 deletions tests/test_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# Copyright (c) 2023 Philip May
# This software is distributed under the terms of the MIT license
# which is available at https://opensource.org/licenses/MIT

import pandas as pd

from mltb2.data import _load_colon_data, _load_colon_label, load_colon


def test_load_colon_data():
    """The data loader returns a DataFrame of shape (62, 2000)."""
    data_df = _load_colon_data()

    assert data_df is not None
    assert isinstance(data_df, pd.DataFrame)

    expected_shape = (62, 2000)
    assert data_df.shape == expected_shape


def test_load_colon_label():
    """The label loader returns a Series with 62 entries."""
    label_series = _load_colon_label()

    assert label_series is not None
    assert isinstance(label_series, pd.Series)

    expected_length = 62
    assert len(label_series) == expected_length


def test_load_colon():
    """The public loader returns a (labels, data) tuple with the expected types and shapes."""
    colon = load_colon()

    # container checks
    assert colon is not None
    assert isinstance(colon, tuple)
    assert len(colon) == 2

    # element types: labels first, data second
    assert isinstance(colon[0], pd.Series)
    assert isinstance(colon[1], pd.DataFrame)

    # element shapes
    expected_label_shape = (62,)
    expected_data_shape = (62, 2000)
    assert colon[0].shape == expected_label_shape
    assert colon[1].shape == expected_data_shape

0 comments on commit f1c3962

Please sign in to comment.