Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added support for matching files #158

Merged
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
74 changes: 39 additions & 35 deletions src/tcutility/pathfunc.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,13 @@
import os
import re
from typing import Dict, List

from tcutility import results
from typing import List, Dict

j = os.path.join


def split_all(path: str) -> List[str]:
"""
'''
Split a path into all of its parts.

Args:
Expand All @@ -22,7 +21,7 @@ def split_all(path: str) -> List[str]:

>>> split_all('a/b/c/d')
['a', 'b', 'c', 'd']
"""
'''
parts = []
while True:
a, b = os.path.split(path)
Expand All @@ -34,7 +33,7 @@ def split_all(path: str) -> List[str]:


def get_subdirectories(root: str, include_intermediates: bool = False) -> List[str]:
"""
'''
Get all sub-directories of a root directory.

Args:
Expand All @@ -55,33 +54,33 @@ def get_subdirectories(root: str, include_intermediates: bool = False) -> List[s
| |- subsubdir_c
|- subdir_b
|- subdir_c

Then we get the following outputs.

.. tabs::

.. group-tab:: Including intermediates

.. code-block:: python

>>> get_subdirectories('root', include_intermediates=True)
['root',
'root/subdir_a',
['root',
'root/subdir_a',
'root/subdir_a/subsubdir_b',
'root/subdir_a/subsubdir_c',
'root/subdir_b',
'root/subdir_a/subsubdir_c',
'root/subdir_b',
'root/subdir_c']

.. group-tab:: Excluding intermediates

.. code-block:: python

>>> get_subdirectories('root', include_intermediates=False)
['root/subdir_a/subsubdir_b',
'root/subdir_a/subsubdir_c',
'root/subdir_b',
['root/subdir_a/subsubdir_b',
'root/subdir_a/subsubdir_c',
'root/subdir_b',
'root/subdir_c']
"""
'''
dirs = [root]
subdirs = set()

Expand All @@ -101,8 +100,8 @@ def get_subdirectories(root: str, include_intermediates: bool = False) -> List[s
return subdirs


def match(root: str, pattern: str) -> Dict[str, dict]:
"""
def match(root: str, pattern: str, match_files: bool = True) -> Dict[str, dict]:
'''
Find and return information about subdirectories (and, when ``match_files`` is enabled, files) of a root that match a given pattern.

Args:
Expand All @@ -111,16 +110,17 @@ def match(root: str, pattern: str) -> Dict[str, dict]:
It should look similar to a format string, without the ``f`` in front of the string.
Inside curly braces you can put a variable name, which you can later extract from the results.
Anything inside curly braces will be matched to word characters as well as dots, dashes and underscores (``[a-zA-Z0-9._-]``).
match_files: whether to match the files inside the subdirectories in addition to the subdirectories themselves. Defaults to True.

Returns:
| A |Result| object containing the matched directories as keys and information (also a |Result| object) about those matches as the values.
Each information dictionary contains the variables given in the pattern.
| E.g. using a pattern such as ``{a}/{b}/{c}`` will populate the ``info.a``, ``info.b`` and ``info.c`` keys of the info |Result| object.

Example:
Given a file-structure as follows:

.. code-block::
.. code-block::

root
|- NH3-BH3
Expand All @@ -141,13 +141,13 @@ def match(root: str, pattern: str) -> Dict[str, dict]:
We can run the following scripts to match the subdirectories.

.. code-block:: python

from tcutility import log
# get the matches, we want to extract the system name (NH3-BH3 or SN2)
# get the matches, we want to extract the system name (NH3-BH3 or SN2)
# and the functional and basis-set
# we don't want the subdirectories
matches = match('root', '{system}/{functional}_{basis_set}')

# print the matches as a table
rows = []
for d, info in matches.items():
Expand All @@ -157,31 +157,35 @@ def match(root: str, pattern: str) -> Dict[str, dict]:

which prints

.. code-block::
.. code-block::

[2024/01/17 14:39:08] Directory System Functional Basis-Set
[2024/01/17 14:39:08] ───────────────────────────────────────────────────────────
[2024/01/17 14:39:08] root/SN2/M06-2X_TZ2P SN2 M06-2X TZ2P
[2024/01/17 14:39:08] root/NH3-BH3/BLYP_TZ2P NH3-BH3 BLYP TZ2P
[2024/01/17 14:39:08] root/NH3-BH3/M06-2X_TZ2P NH3-BH3 M06-2X TZ2P
[2024/01/17 14:39:08] root/SN2/BLYP_TZ2P SN2 BLYP TZ2P
[2024/01/17 14:39:08] root/NH3-BH3/BLYP_QZ4P NH3-BH3 BLYP QZ4P


"""
[2024/01/17 14:39:08] root/SN2/M06-2X_TZ2P SN2 M06-2X TZ2P
[2024/01/17 14:39:08] root/NH3-BH3/BLYP_TZ2P NH3-BH3 BLYP TZ2P
[2024/01/17 14:39:08] root/NH3-BH3/M06-2X_TZ2P NH3-BH3 M06-2X TZ2P
[2024/01/17 14:39:08] root/SN2/BLYP_TZ2P SN2 BLYP TZ2P
[2024/01/17 14:39:08] root/NH3-BH3/BLYP_QZ4P NH3-BH3 BLYP QZ4P
'''
# get the number and names of substitutions in the given pattern
substitutions = re.findall(r"{(\w+[+*?]?)}", pattern)
substitutions = re.findall(r'{(\w+[+*?]?)}', pattern)
# the pattern should resolve to words and may contain - and _
# replace them here
for sub in substitutions:
quantifier = sub[-1] if sub[-1] in "+*?" else "+"
pattern = pattern.replace("{" + sub + "}", f"([a-zA-Z0-9_-]{quantifier})")
quantifier = sub[-1] if sub[-1] in '+*?' else '+'
pattern = pattern.replace('{' + sub + '}', f'([a-zA-Z0-9._-]{quantifier})')

ret = results.Result()
# root dir can be any level deep. We should count how many directories are in root
root_length = len(split_all(root))
# get all subdirectories first, we can loop through them later
subdirs = get_subdirectories(root, include_intermediates=True)
if match_files:
_subdirs = []
for subdir in subdirs:
_subdirs.extend([j(subdir, file) for file in os.listdir(subdir)])
subdirs = _subdirs

# remove the root from the subdirectories. We cannot use str.removeprefix because it was added in python 3.9
subdirs = [j(*split_all(subdir)[root_length:]) for subdir in subdirs if len(split_all(subdir)[root_length:]) > 0]
for subdir in subdirs:
Expand All @@ -192,6 +196,6 @@ def match(root: str, pattern: str) -> Dict[str, dict]:

p = j(root, subdir)
# get the group data and add it to the return dictionary. We skip the first group because it is the full directory path
ret[p] = results.Result(directory=p, **{substitutions[i]: match.group(i + 1) for i in range(len(substitutions))})
ret[p] = results.Result(directory=p, **{substitutions[i]: match.group(i+1) for i in range(len(substitutions))})

return ret
Loading