"""
Capture parts from strings using a convenient glob-like syntax.
"""
import re
from dataclasses import dataclass
from glob import glob
from typing import Dict
from typing import List
from typing import Optional
from .value import Value
from .value import value_as_list
__all__ = ["captures", "globs", "Capture", "capture2glob", "capture2re"]
[docs]@dataclass
class Capture:
"""
Capture the results of successfully matching a string (typically, a path name) with a capture
pattern.
A capture pattern is similar to a ``glob`` pattern. However, all wildcards must be specified
inside ``{...}`` as follows:
* You need to escape the ``{`` character as ``{{`` and the ``}`` character as ``}}``.
* ``{*name}`` has the same effect as ``*``. The matching substring will be captured using the
key ``name``. For example, ``foo.{*suffix}`` will capture the file suffix.
* If ``name`` starts with ``_`` then the matching substring will be discarded instead of being
captured. For example, if you don't want to capture the suffix, write ``foo.{*_}`` instead of
``foo.{*suffix}``.
* If ``name`` is followed by ``:``, it must be followed by a glob pattern. That is, ``{*name}``
is a shorthand for ``{*name:*}``. For example ``foo.{*suffix:[0-9]}`` will capture a single
decimal digit suffix.
* ``{**name}`` is shorthand for ``{*name:**}``. In this case you may not use ``:`` to specify
a glob pattern. For example, ``foo/{**dir}/bar`` will capture all the (possibly empty) paths
from ``foo`` to nested ``bar`` files.
.. note::
Use ``/{**name}/`` instead of ``/{*name:**}/`` as the shorthand is given special treatment
allow for capturing an empty sub-directory (that is, match a single ``/``).
The captured named values are available as members of the object, that is, write
``capture.foo`` to access the value of a captured ``{*foo}``.
"""
def __init__(self, **kwargs: str) -> None:
self.__dict__.update(kwargs)
[docs]def captures(pattern: str, values: Value, *, must_match: bool = False, name: str = "path") -> List[Capture]:
"""
Given a capture ``pattern``, return all the :py:class:`Capture` results of applying it to each of the (sorted,
unique, non-``None``) ``values``. If ``must_match``, all the values must match the pattern. Otherwise, only captures
of matching values are returned.
By default, the complete matched string is made available in a data member ``path`` (as this is typically used to
parse disk file paths). You can override this by specifying a different ``name``.
See :py:class:`Capture` for the description of the capture pattern.
"""
results: List[Capture] = []
regexp = capture2re(pattern)
for value in sorted(set(value_as_list(values))):
parts = _capture_string_parts(regexp, value)
if parts is not None:
parts[name] = value
results.append(Capture(**parts))
elif must_match:
raise ValueError(f"the value: {value} does not match the pattern: {pattern}")
return results
[docs]def globs(pattern: str) -> List[Capture]:
"""
Given a capture ``pattern``, return all the :py:class:`Capture` results of applying it to the
results of a ``glob`` of the equivalent pattern.
See :py:class:`Capture` for the description of the capture pattern.
"""
return captures(pattern, glob(capture2glob(pattern)), must_match=True)
def _capture_string_parts(regexp: re.Pattern, string: str) -> Optional[Dict[str, str]]:
match = re.fullmatch(regexp, string)
if not match:
return None
values = match.groupdict()
for name, value in values.items():
if name and name[0] != "_":
values[name] = str(value or "")
return values
[docs]def capture2re(capture: str) -> re.Pattern: # pylint: disable=too-many-statements
"""
Convert a capture pattern to the equivalent ``re.Pattern``.
"""
index = 0
size = len(capture)
results: List[str] = []
def _is_next(expected: str) -> bool:
nonlocal capture, index, size
return index < size and capture[index] == expected
def _invalid(reason: str = "") -> None:
nonlocal capture, index
raise ValueError(f'Invalid capture pattern:\n{capture}\n{index * " "}^ {reason}')
def _expect_close() -> None:
if not _is_next("}"):
_invalid("missing }")
nonlocal index
index += 1
def _parse_name(terminators: str) -> str:
nonlocal capture, index, size
start_index = index
while index < size and capture[index] not in terminators:
if index == start_index:
if capture[index] != "_" and not capture[index].isalpha():
_invalid("invalid first captured name character")
else:
if capture[index] != "_" and not capture[index].isalnum():
_invalid("invalid captured name character")
index += 1
if index == start_index:
_invalid("empty captured name")
return capture[start_index:index]
def _parse_regexp() -> str:
nonlocal capture, index, size
if not _is_next(":"):
return ""
index += 1
start_index = index
while index < size and capture[index] != "}":
index += 1
if index == start_index:
_invalid("empty captured regexp")
return _glob2re(capture[start_index:index])
def _parse_two_stars() -> None:
name = _parse_name("}")
regexp = _parse_regexp() or ".*"
_expect_close()
nonlocal capture, index, size, results
if results and results[-1] == "/" and index < size and capture[index] == "/":
index += 1
_append_regexp(name, regexp, "(?:", "/)?")
else:
_append_regexp(name, regexp)
def _parse_one_star() -> None:
name = _parse_name(":}")
regexp = _parse_regexp() or "[^/]*"
_expect_close()
_append_regexp(name, regexp)
def _append_regexp(name: str, regexp: str, prefix: str = "", suffix: str = "") -> None:
nonlocal results
results.append(prefix)
if not name.startswith("_"):
results.append("(?P<")
results.append(name)
results.append(">")
results.append(regexp)
if not name.startswith("_"):
results.append(")")
results.append(suffix)
while index < size:
char = capture[index]
index += 1
if char == "}":
if _is_next("}"):
results.append("}")
index += 1
else:
_invalid("unescaped }")
elif char == "{":
if _is_next("{"):
results.append("{")
index += 1
elif _is_next("*"):
index += 1
if _is_next("*"):
index += 1
_parse_two_stars()
else:
_parse_one_star()
else:
_invalid("unescaped { not followed by a *")
elif char in "*?[]":
_invalid(f"unescaped {char} outside capture {{*name:...}}")
else:
results.append(re.escape(char))
return re.compile("".join(results))
def _glob2re(glob: str) -> str: # pylint: disable=too-many-branches,redefined-outer-name
"""
Translate a ``glob`` pattern to the equivalent ``re.Pattern`` (as a string).
This is subtly different from ``fnmatch.translate`` since we use it to match the result of a successful ``glob``
rather than to actually perform the ``glob``.
"""
index = 0
size = len(glob)
results: List[str] = []
while index < size:
char = glob[index]
index += 1
if char == "*":
if index < size and glob[index] == "*":
index += 1
if results and results[-1] == "/" and index < size and glob[index] == "/":
results.append("(.*/)?")
index += 1
else:
results.append(".*")
else:
results.append("[^/]*")
elif char == "?":
results.append("[^/]")
elif char == "[":
end_index = index
while end_index < size and glob[end_index] != "]":
end_index += 1
if end_index >= size:
results.append("\\[")
else:
characters = glob[index:end_index].replace("\\", "\\\\")
index = end_index + 1
results.append("[")
if characters[0] == "!":
results.append("^/")
characters = characters[1:]
elif characters[0] == "^":
results.append("\\")
results.append(characters)
results.append("]")
elif char in "{}/":
results.append(char)
else:
results.append(re.escape(char))
return "".join(results)
[docs]def capture2glob(capture: str) -> str: # pylint: disable=too-many-statements
"""
Translate a capture pattern to the equivalent ``glob`` pattern.
"""
index = 0
size = len(capture)
results: List[str] = []
def _is_next(expected: str) -> bool:
nonlocal capture, index, size
return index < size and capture[index] == expected
def _invalid(reason: str = "") -> None:
nonlocal capture, index
raise ValueError(f'Invalid capture pattern:\n{capture}\n{index * " "}^ {reason}')
def _parse_glob(glob: str, terminators: str) -> None: # pylint: disable=redefined-outer-name
nonlocal capture, index, size
start_index = index
while index < size and capture[index] not in terminators:
if index == start_index:
if capture[index] != "_" and not capture[index].isalpha():
_invalid("invalid first captured name character")
else:
if capture[index] != "_" and not capture[index].isalnum():
_invalid("invalid captured name character")
index += 1
if index == start_index:
_invalid("empty captured name")
if index < size and capture[index] == ":":
index += 1
start_index = index
while index < size and capture[index] != "}":
index += 1
if index == start_index:
_invalid("empty captured regexp")
glob = capture[start_index:index]
if not _is_next("}"):
_invalid("missing }")
index += 1
results.append(glob)
while index < size:
char = capture[index]
index += 1
if char == "}":
if _is_next("}"):
results.append("}")
index += 1
else:
_invalid("unescaped }")
elif char == "{":
if _is_next("{"):
results.append("{")
index += 1
elif _is_next("*"):
index += 1
if _is_next("*"):
index += 1
_parse_glob("**", "}")
else:
_parse_glob("*", ":}")
else:
_invalid("unescaped { not followed by a *")
elif char in "*?[]":
_invalid(f"unescaped {char} outside capture {{*name:...}}")
else:
results.append(char)
return "".join(results)