fix: update URL handling functions #152

Merged · 5 commits · Jan 19, 2023
docs/source/requirements.rst (3 changes: 2 additions & 1 deletion)
@@ -194,7 +194,8 @@ the requirement falls in, the type of requirement, and whether
' docformatter_10.1.1', ' Shall not wrap lists or syntax directive statements', ' Derived', ' Shall', ' Yes'
' docformatter_10.1.1.1', ' Should allow wrapping of lists and syntax directive statements.', ' Stakeholder', ' Should', ' Yes [*PR #5*, *PR #93*]'
' docformatter_10.1.2', ' Should allow/disallow wrapping of one-line docstrings.', ' Derived', ' Should', ' No'
-' docformatter_10.1.3', ' Shall not wrap links that exceed the wrap length.', Derived', ' Shall', ' Yes [*PR #114*]'
+' docformatter_10.1.3', ' Shall not wrap links that exceed the wrap length.', ' Derived', ' Shall', ' Yes [*PR #114*]'
+' docformatter_10.1.3.1', ' Shall maintain in-line links on one line even if the resulting line exceeds wrap length.', ' Derived', ' Shall', ' Yes [*PR #152*]'
' docformatter_10.2', ' Should format docstrings using NumPy style.', ' Style', ' Should', ' No'
' docformatter_10.3', ' Should format docstrings using Google style.', ' Style', ' Should', ' No'
' docformatter_10.4', ' Should format docstrings using Sphinx style.',' Style', ' Should', ' No'
pyproject.toml (2 changes: 1 addition & 1 deletion)
@@ -238,7 +238,7 @@ deps =
    untokenize
commands =
    docformatter --recursive {toxinidir}/src/docformatter
-    pycodestyle {toxinidir}/src/docformatter
+    pycodestyle --ignore=E203,W503,W504 {toxinidir}/src/docformatter
    pydocstyle {toxinidir}/src/docformatter
    pylint --rcfile={toxinidir}/pyproject.toml {toxinidir}/src/docformatter
    rstcheck --report-level=1 {toxinidir}/README.rst
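For context on the new flags: E203 flags whitespace before ':', and W503/W504 flag a line break before and after a binary operator (a mutually exclusive pair). All three conflict with Black-style formatting conventions, which is presumably why they are suppressed for the lint run here.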
src/docformatter/syntax.py (257 changes: 121 additions & 136 deletions)
@@ -24,16 +24,83 @@
"""This module provides docformatter's Syntaxor class."""

# Standard Library Imports
-import contextlib
import re
import textwrap
-from typing import List
+from typing import Iterable, List, Tuple, Union

+# These are the URL patterns to look for when finding links; the list is
+# based on the table at <https://en.wikipedia.org/wiki/List_of_URI_schemes>.
+URL_PATTERNS = (
+    "afp|"
+    "apt|"
+    "bitcoin|"
+    "chrome|"
+    "cvs|"
+    "dav|"
+    "dns|"
+    "file|"
+    "finger|"
+    "fish|"
+    "ftp|"
+    "ftps|"
+    "git|"
+    "http|"
+    "https|"
+    "imap|"
+    "ipp|"
+    "ipps|"
+    "irc|"
+    "irc6|"
+    "ircs|"
+    "jar|"
+    "ldap|"
+    "ldaps|"
+    "mailto|"
+    "news|"
+    "nfs|"
+    "nntp|"
+    "pop|"
+    "rsync|"
+    "s3|"
+    "sftp|"
+    "shttp|"
+    "sip|"
+    "sips|"
+    "smb|"
+    "sms|"
+    "snmp|"
+    "ssh|"
+    "svn|"
+    "telnet|"
+    "vnc|"
+    "xmpp|"
+    "xri"
+)
+
+# This is the regex used to find URL links:
+#
+# (`[\w. :]+|\.\. _?[\w :]+)? is used to find in-line links that should
+# remain on a single line even if it exceeds the wrap length.
+#   `[\w. :]+ matches the character ` followed by any number of letters,
+#   digits, periods, spaces, or colons.
+#   \.\. _?[\w :]+ matches the pattern .. followed by a space, zero or one
+#   underscore, then any number of letters, digits, spaces, or colons.
+#   The trailing ? matches the whole group zero or one time.
+# <?({URL_PATTERNS}):(//)?(\S*)>? is used to find the actual link.
+#   <? matches the character < zero or one time.
+#   ({URL_PATTERNS}):(//)? matches one of the schemes in URL_PATTERNS,
+#   followed by a colon, then optionally two forward slashes.
+#   (\S*) matches any number of non-whitespace characters.
+#   >? matches the character > zero or one time.
+URL_REGEX = rf"(`[\w. :]+|\.\. _?[\w :]+)?<?({URL_PATTERNS}):(//)?(\S*)>?"
HEURISTIC_MIN_LIST_ASPECT_RATIO = 0.4
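As an aside, the behavior of URL_REGEX is easy to probe interactively. A minimal sketch, assuming this branch is installed so that docformatter.syntax is importable; the sample strings are invented:

import re

from docformatter.syntax import URL_REGEX

samples = (
    "See `the docs <https://example.com/page>`_ for details.",
    ".. _a link: https://example.com",
    "plain text with an s3://bucket/key reference",
)
for sample in samples:
    # group(0) is the full span the pattern treats as one unbreakable link,
    # including any in-line link text or link target definition before it.
    match = re.search(URL_REGEX, sample)
    print(match.group(0))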


def description_to_list(
-    text: str, indentation: str, wrap_length: int
+    text: str,
+    indentation: str,
+    wrap_length: int,
) -> List[str]:
"""Convert the description to a list of wrap length lines.

@@ -75,15 +142,34 @@ def description_to_list(
            lines.extend(_text)
        else:
            lines.append("")

    return lines


-def do_preserve_links(
+def do_find_links(text: str) -> List[Tuple[int, int]]:
+    r"""Determine if docstring contains any links.
+
+    Parameters
+    ----------
+    text: str
+        the docstring description to check for link patterns.
+
+    Returns
+    -------
+    url_index: list
+        a list of tuples with each tuple containing the starting and ending
+        position of each URL found in the passed description.
+    """
+    _url_iter = re.finditer(URL_REGEX, text)
+    return [(_url.start(0), _url.end(0)) for _url in _url_iter]
+

+def do_split_description(
    text: str,
    indentation: str,
    wrap_length: int,
-) -> List[str]:
-    """Rebuild links in docstring.
+) -> Union[List[str], Iterable]:
+    """Split the description into a list of lines.

    Parameters
    ----------
@@ -101,132 +187,34 @@ def do_preserve_links(
        A list containing each line of the description with any links put
        back together.
    """
-    lines = description_to_list(text, indentation, wrap_length)
-
-    # There is nothing to do if the input wasn't wrapped.
-    if len(lines) < 2:
-        return lines
-
-    url = is_some_sort_of_link(lines)
-
-    if url != "":
-        url_idx = lines.index(url)
-
-        # Is this an in-line link (i.e., enclosed in <>)? We want to keep
-        # the '<' and '>' part of the link.
-        if re.search(r"<", url):
-            if len(url.split(sep="<")[0].strip()) > 0:
-                lines[url_idx] = (
-                    f"{indentation}" + url.split(sep="<")[0].strip()
-                )
-
-            url = f"{indentation}<" + url.split(sep="<")[1]
-            if len(url.split(sep=">")) < 2:
-                url = url + lines[url_idx + 1].strip()
-            lines[url_idx + 1] = url
-
-        # Is this a link target definition (i.e., .. a link: https://)? We
-        # want to keep the .. a link: on the same line as the url.
-        elif re.search(r"(\.\. )", url):
-            url = url + lines[url_idx + 1].strip()
-            lines[url_idx] = url
-            lines.pop(url_idx + 1)
-
-        # Is this a simple link (i.e., just a link in the text) that should
-        # be unwrapped? We want to break the url out from the rest of the
-        # text.
-        elif len(lines[url_idx]) >= wrap_length:
-            lines[url_idx] = (
-                f"{indentation}" + url.strip().split(sep=" ")[0].strip()
-            )
-            url = f"{indentation}" + url.strip().split(sep=" ")[1].strip()
-            url = url + lines[url_idx + 1].strip().split(sep=" ")[0].strip()
-            lines.append(
-                indentation
-                + " ".join(lines[url_idx + 1].strip().split(sep=" ")[1:])
-            )
-            lines[url_idx + 1] = url
-
-        with contextlib.suppress(IndexError):
-            if lines[url_idx + 2].strip() in [".", "?", "!", ";"] or re.search(
-                r">", lines[url_idx + 2]
-            ):
-                url = url + lines[url_idx + 2].strip()
-                lines[url_idx + 1] = url
-                lines.pop(url_idx + 2)
-
-    return lines
-
-
-def is_some_sort_of_link(lines: List[str]) -> str:
-    """Determine if docstring line contains a link.
-
-    URL patterns based on table at
-    <https://en.wikipedia.org/wiki/List_of_URI_schemes>
-
-    Parameters
-    ----------
-    lines: str
-        the list of docstring lines to check for a link pattern.
-
-    Returns
-    -------
-    url: str
-        the line with the url pattern.
-    """
-    url_patterns = (
-        "("
-        "afp://|"
-        "apt:|"
-        "bitcoin:|"
-        "chrome://|"
-        "cvs://|"
-        "dav://|"
-        "dns:|"
-        "file://|"
-        "finger://|"
-        "fish://|"
-        "ftp://|"
-        "ftps://|"
-        "git://|"
-        "http://|"
-        "https://|"
-        "imap://|"
-        "ipp://|"
-        "ipps://|"
-        "irc://|"
-        "irc6://|"
-        "ircs://|"
-        "jar:|"
-        "ldap://|"
-        "ldaps://|"
-        "mailto:|"
-        "news:|"
-        "nfs://|"
-        "nntp://|"
-        "pop://|"
-        "rsync://|"
-        "s3://|"
-        "sftp://|"
-        "shttp://|"
-        "sip:|"
-        "sips:|"
-        "smb://|"
-        "sms:|"
-        "snmp://|"
-        "ssh://|"
-        "svn://|"
-        "telnet://|"
-        "vnc://|"
-        "xmpp:|"
-        "xri://"
-        ")"
-    )
-
-    return next(
-        (line for line in lines if re.search(rf"<?{url_patterns}", line)),
-        "",
-    )
+    # Check if the description contains any URLs.
+    _url_idx = do_find_links(text)
+    if _url_idx:
+        _lines = []
+        _text_idx = 0
+        for _idx in _url_idx:
+            # If the text including the URL is longer than the wrap length,
+            # we need to split the description before the URL, wrap the pre-URL
+            # text, and add the URL as a separate line.
+            if len(text[_text_idx : _idx[1]]) > (
+                wrap_length - len(indentation)
+            ):
+                # Wrap everything in the description before the first URL.
+                _lines.extend(
+                    description_to_list(
+                        text[_text_idx : _idx[0]], indentation, wrap_length
+                    )
+                )
+                # Add the URL.
+                _lines.append(f"{indentation}{text[_idx[0]:_idx[1]].strip()}")
+                _text_idx = _idx[1]

+        # Finally, add everything after the last URL.
+        _lines.append(f"{indentation}{text[_text_idx:].strip()}")

+        return _lines
+    else:
+        return description_to_list(text, indentation, wrap_length)
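For illustration, assuming this branch is installed, the two new functions compose like this; the sample text is invented:

from docformatter.syntax import do_find_links, do_split_description

text = (
    "The complete API reference is available at "
    "https://example.com/a/rather/long/reference/page and should be "
    "read before opening an issue."
)

# Start/end offsets of each URL in the raw description.
print(do_find_links(text))

# The pre-URL text is wrapped normally, the URL comes out as a single
# line even though it exceeds the wrap length, and the trailing text
# follows on its own line.
for line in do_split_description(text, indentation="    ", wrap_length=72):
    print(repr(line))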


# pylint: disable=line-too-long
@@ -262,8 +250,6 @@ def is_some_sort_of_list(text, strict) -> bool:

    return any(
        (
-            # re.match(r"\s*$", line)
-            # or
            # "1. item"
            re.match(r"\s*\d\.", line)
            or
@@ -289,8 +275,7 @@ def is_some_sort_of_list(text, strict) -> bool:
def is_some_sort_of_code(text: str) -> bool:
    """Return True if text looks like code."""
    return any(
-        len(word) > 50
-        and not re.match(r"<{0,1}(http:|https:|ftp:|sftp:)", word)
+        len(word) > 50 and not re.match(URL_REGEX, word)
        for word in text.split()
    )
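A small illustration of the effect, assuming this branch is installed; the sample strings are invented. The old pattern only recognized http, https, ftp, and sftp, so a long word using any other scheme was misclassified as code, whereas URL_REGEX covers the full scheme table:

from docformatter.syntax import is_some_sort_of_code

long_url = "s3://an-example-bucket/with/a/rather/long/object/key/inside"
print(is_some_sort_of_code(f"see {long_url}"))  # False: a link, not code
print(is_some_sort_of_code("x" * 60))  # True: a 60-character non-URL word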

@@ -385,6 +370,6 @@ def wrap_description(text, indentation, wrap_length, force_wrap, strict):
    ):
        return text

-    text = do_preserve_links(text, indentation, wrap_length)
+    lines = do_split_description(text, indentation, wrap_length)

-    return indentation + "\n".join(text).strip()
+    return indentation + "\n".join(lines).strip()