Skip to content

Commit

Permalink
Improve doc and typing. (#108)
Browse files Browse the repository at this point in the history
* Update module names

* Update doc in fasttext.py

* Update data source URLs

* improve get_and_create_mltb2_data_dir doc

* improve doc of chunk_md

* improve doc of plot module
  • Loading branch information
PhilipMay authored Dec 10, 2023
1 parent e8ac193 commit 924113c
Show file tree
Hide file tree
Showing 11 changed files with 79 additions and 29 deletions.
17 changes: 6 additions & 11 deletions mltb2/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
# This software is distributed under the terms of the MIT license
# which is available at https://opensource.org/licenses/MIT

"""Data loading functionality.
"""Data loading module.
Use pip to install the necessary dependencies for this module:
``pip install mltb2[data]``
Expand All @@ -28,8 +28,7 @@ def _load_colon_data() -> pd.DataFrame:
"""Load colon data (not the labels).
The data is loaded and parsed from the internet.
Also see `colon tissues probed by oligonucleotide arrays
<http://genomics-pubs.princeton.edu/oncology/affydata/index.html>`_.
Also see `<http://genomics-pubs.princeton.edu/oncology/affydata/index.html>`_.
Returns:
data as pandas DataFrame
Expand Down Expand Up @@ -60,8 +59,7 @@ def _load_colon_label() -> pd.Series:
"""Load colon label (not the data).
The data is loaded and parsed from the internet.
Also see `colon tissues probed by oligonucleotide arrays
<http://genomics-pubs.princeton.edu/oncology/affydata/index.html>`_.
Also see `<http://genomics-pubs.princeton.edu/oncology/affydata/index.html>`_.
Returns:
labels as pandas Series
Expand Down Expand Up @@ -96,8 +94,7 @@ def load_colon() -> Tuple[pd.Series, pd.DataFrame]:
"""Load colon data.
The data is loaded and parsed from the internet.
Also see `colon tissues probed by oligonucleotide arrays
<http://genomics-pubs.princeton.edu/oncology/affydata/index.html>`_.
Also see `<http://genomics-pubs.princeton.edu/oncology/affydata/index.html>`_.
Returns:
Tuple containing labels and data.
Expand All @@ -119,8 +116,7 @@ def load_colon() -> Tuple[pd.Series, pd.DataFrame]:
def load_prostate() -> Tuple[pd.Series, pd.DataFrame]:
"""Load prostate data.
The data is loaded and parsed from `prostate data
<https://web.stanford.edu/~hastie/CASI_files/DATA/prostate.html>`_.
The data is loaded and parsed from `<https://web.stanford.edu/~hastie/CASI_files/DATA/prostate.html>`_.
Returns:
Tuple containing labels and data.
Expand Down Expand Up @@ -166,8 +162,7 @@ def load_leukemia_big() -> Tuple[pd.Series, pd.DataFrame]:
"""Load leukemia (big) data.
The data is loaded and parsed from the internet.
Also see `leukemia data
<https://web.stanford.edu/~hastie/CASI_files/DATA/leukemia.html>`_.
Also see `<https://web.stanford.edu/~hastie/CASI_files/DATA/leukemia.html>`_.
Returns:
Tuple containing labels and data.
Expand Down
10 changes: 7 additions & 3 deletions mltb2/fasttext.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
# This software is distributed under the terms of the MIT license
# which is available at https://opensource.org/licenses/MIT

"""fastText specific functionality.
"""fastText specific module.
This module is based on `fastText <https://fasttext.cc/docs/en/support.html>`_.
Use pip to install the necessary dependencies for this module:
Expand Down Expand Up @@ -30,7 +30,11 @@ def __post_init__(self):

@staticmethod
def get_model_path_and_download() -> str:
"""Get the model path and download it if needed."""
"""Get the model path and download it if needed.
Returns:
The full path to the downloaded model file.
"""
model_filename = "lid.176.bin"
mltb2_data_home = get_and_create_mltb2_data_dir()
model_full_path = os.path.join(mltb2_data_home, model_filename)
Expand All @@ -47,7 +51,7 @@ def get_model_path_and_download() -> str:

return model_full_path

def __call__(self, text, num_lang: int = 10):
def __call__(self, text: str, num_lang: int = 10):
"""Identify languages of a given text.
Args:
Expand Down
8 changes: 6 additions & 2 deletions mltb2/files.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
# This software is distributed under the terms of the MIT license
# which is available at https://opensource.org/licenses/MIT

"""File utils.
"""File utils module.
Use pip to install the necessary dependencies for this module:
``pip install mltb2[files]``
Expand All @@ -20,6 +20,10 @@
def get_and_create_mltb2_data_dir(mltb2_base_data_dir: Optional[str] = None) -> str:
"""Return and create mltb2 data dir.
Args:
mltb2_base_data_dir: The base data directory. If ``None`` the default
user data directory is used.
Returns:
The directory path.
"""
Expand All @@ -32,7 +36,7 @@ def get_and_create_mltb2_data_dir(mltb2_base_data_dir: Optional[str] = None) ->
return mltb2_data_dir


def fetch_remote_file(dirname, filename, url, sha256_checksum) -> str:
def fetch_remote_file(dirname, filename, url: str, sha256_checksum: str) -> str:
"""Fetch a file from a remote URL.
Args:
Expand Down
25 changes: 21 additions & 4 deletions mltb2/md.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
# This software is distributed under the terms of the MIT license
# which is available at https://opensource.org/licenses/MIT

"""Markdown specific functionality.
"""Markdown specific module.
Use pip to install the necessary dependencies for this module:
``pip install mltb2[md]``
Expand All @@ -21,7 +21,14 @@


def _chunk_md_by_headline(md_text: str) -> List[str]:
"""Chunk Markdown by headlines."""
"""Chunk Markdown by headlines.
Args:
md_text: The Markdown text to be chunked.
Returns:
The list of Markdown chunks.
"""
positions: List[int] = [m.start() for m in re.finditer(_HEADLINE_REGEX, md_text)]

# extend positions
Expand All @@ -34,14 +41,24 @@ def _chunk_md_by_headline(md_text: str) -> List[str]:


def chunk_md(md_text: str) -> List[str]:
"""Chunk Markdown by headlines and merge isolated headlines."""
"""Chunk Markdown by headlines and merge isolated headlines.
Merges isolated headlines with their corresponding subsequent paragraphs.
Headings isolated at the end of ``md_text`` (headings without content) are removed in this process.
Args:
md_text: The Markdown text to be chunked.
Returns:
The list of Markdown chunks.
"""
md_chunks = _chunk_md_by_headline(md_text)

merged_chunks = []
temp_merged_chunk = []
for chunk in md_chunks:
temp_merged_chunk.append(chunk)
if "\n" in chunk: # content found
if "\n" in chunk: # content chunk found
joined_content = "\n\n".join(temp_merged_chunk)
merged_chunks.append(joined_content)
temp_merged_chunk = []
Expand Down
2 changes: 1 addition & 1 deletion mltb2/openai.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
# This software is distributed under the terms of the MIT license
# which is available at https://opensource.org/licenses/MIT

"""OpenAI specific functionality.
"""OpenAI specific module.
Use pip to install the necessary dependencies for this module:
``pip install mltb2[openai]``
Expand Down
2 changes: 1 addition & 1 deletion mltb2/optuna.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
# This software is distributed under the terms of the MIT license
# which is available at https://opensource.org/licenses/MIT

"""Optuna specific functionality.
"""Optuna specific module.
This module is based on `Optuna <https://optuna.readthedocs.io/en/stable/>`_.
Use pip to install the necessary dependencies for this module:
Expand Down
36 changes: 33 additions & 3 deletions mltb2/plot.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
# This software is distributed under the terms of the MIT license
# which is available at https://opensource.org/licenses/MIT

"""A collection of plot tools.
"""Plot tools module.
This module is based on `Matplotlib <https://matplotlib.org/>`_.
Use pip to install the necessary dependencies for this module:
Expand Down Expand Up @@ -78,10 +78,26 @@ def twin_axes_timeseries_plot(
fig.tight_layout()


def boxplot(values, labels=None, title=None, xlabel=None, ylabel=None, vert=True):
def boxplot(
values,
labels=None,
title: Optional[str] = None,
xlabel: Optional[str] = None,
ylabel: Optional[str] = None,
vert: bool = True,
):
"""Print one or more boxplots in a single diagram.
This function does not call `matplotlib.pyplot.plot()`.
Args:
values: Values for the boxplot(s).
labels: Labels for the boxplot(s).
title: Title of the plot.
xlabel: Label for the x-axis.
ylabel: Label for the y-axis.
vert: If ``True`` (default), makes the boxes vertical.
If ``False``, makes horizontal boxes.
"""
_, ax = plt.subplots()

Expand All @@ -106,10 +122,24 @@ def boxplot(values, labels=None, title=None, xlabel=None, ylabel=None, vert=True
plt.xticks(rotation=90)


def boxplot_dict(values_dict, title=None, xlabel=None, ylabel=None, vert=True):
def boxplot_dict(
values_dict,
title: Optional[str] = None,
xlabel: Optional[str] = None,
ylabel: Optional[str] = None,
vert: bool = True,
):
"""Create boxplot from dictionary.
This function does not call `matplotlib.pyplot.plot()`.
Args:
values_dict: Dictionary with values for the boxplot(s).
title: Title of the plot.
xlabel: Label for the x-axis.
ylabel: Label for the y-axis.
vert: If ``True`` (default), makes the boxes vertical.
If ``False``, makes horizontal boxes.
"""
values = []
labels = []
Expand Down
2 changes: 1 addition & 1 deletion mltb2/somajo.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
# This software is distributed under the terms of the MIT license
# which is available at https://opensource.org/licenses/MIT

"""SoMaJo specific functionality.
"""SoMaJo specific module.
This module is based on `SoMaJo <https://github.com/tsproisl/SoMaJo>`_.
Use pip to install the necessary dependencies for this module:
Expand Down
2 changes: 1 addition & 1 deletion mltb2/somajo_transformers.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
# This software is distributed under the terms of the MIT license
# which is available at https://opensource.org/licenses/MIT

"""Hugging Face Transformers and SoMaJo specific functionality.
"""Hugging Face Transformers and SoMaJo specific module.
This module is based on
`Hugging Face Transformers <https://huggingface.co/docs/transformers/index>`_ and
Expand Down
2 changes: 1 addition & 1 deletion mltb2/text.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
# This software is distributed under the terms of the MIT license
# which is available at https://opensource.org/licenses/MIT

"""Text specific functionality."""
"""Text specific module."""

from typing import Dict, Final, Tuple

Expand Down
2 changes: 1 addition & 1 deletion mltb2/transformers.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
# This software is distributed under the terms of the MIT license
# which is available at https://opensource.org/licenses/MIT

"""Hugging Face Transformers specific functionality.
"""Hugging Face Transformers specific module.
This module is based on
`Hugging Face Transformers <https://huggingface.co/docs/transformers/index>`_.
Expand Down

0 comments on commit 924113c

Please sign in to comment.