# Source code for parsel.utils

from __future__ import annotations

import re
from typing import TYPE_CHECKING, Any, cast

from w3lib.html import replace_entities as w3lib_replace_entities

if TYPE_CHECKING:
    from collections.abc import Iterable, Iterator



[docs]
def flatten(x: Iterable[Any]) -> list[Any]:
    """Return a flat list of every element found in *x*.

    Nested iterables are expanded recursively, so the result holds only
    the leaf values, in the order they are encountered. The actual
    traversal is delegated to ``iflatten()``; this function simply
    collects its output into a list.

    Examples:

    >>> flatten([[[1,2,3], (42,None)], [4,5], [6], 7, (8,9,10)])
    [1, 2, 3, 42, None, 4, 5, 6, 7, 8, 9, 10]
    >>> flatten(["foo", "bar"])
    ['foo', 'bar']
    >>> flatten(["foo", ["baz", 42], "bar"])
    ['foo', 'baz', 42, 'bar']
    """
    flattened: list[Any] = []
    flattened.extend(iflatten(x))
    return flattened




[docs]
def iflatten(x: Iterable[Any]) -> Iterator[Any]:
    """Lazily yield the leaf elements of *x*, descending into sub-iterables.

    This is the generator counterpart of ``flatten()``: it produces the
    same elements in the same order, but one at a time. Elements for
    which ``_is_listlike()`` is true are expanded recursively; everything
    else is yielded as is.

    Nested iterables are consumed on demand rather than being collected
    into an intermediate list first, so a caller can stop early without
    paying for the rest of a large (or unbounded) nested iterator.

    Examples:

    >>> list(iflatten([[1, 2], (3, 4)]))
    [1, 2, 3, 4]
    >>> import itertools
    >>> list(itertools.islice(iflatten([itertools.count()]), 3))
    [0, 1, 2]
    """
    for el in x:
        if _is_listlike(el):
            # Recurse through the generator itself, not flatten(), so the
            # nested iterable is not materialized before yielding.
            yield from iflatten(el)
        else:
            yield el



def _is_listlike(x: Any) -> bool:
    """
    >>> _is_listlike("foo")
    False
    >>> _is_listlike(5)
    False
    >>> _is_listlike(b"foo")
    False
    >>> _is_listlike([b"foo"])
    True
    >>> _is_listlike((b"foo",))
    True
    >>> _is_listlike({})
    True
    >>> _is_listlike(set())
    True
    >>> _is_listlike((x for x in range(3)))
    True
    >>> _is_listlike(range(5))
    True
    """
    return hasattr(x, "__iter__") and not isinstance(x, (str, bytes))



[docs]
def extract_regex(
    regex: str | re.Pattern[str], text: str, replace_entities: bool = True
) -> list[str]:
    """Apply *regex* to *text* and return the extracted strings as a list.

    *regex* may be a pattern string (compiled here with ``re.UNICODE``) or
    an already compiled pattern. What gets extracted depends on its groups:

    * with a named group called "extract", only the first match is
      considered and the content of that group is returned (an empty list
      if there is no match or the group did not participate);
    * with numbered groups, the groups of every match are returned,
      flattened into a single list;
    * with no groups at all, the full text of every match is returned.

    Unless *replace_entities* is false, each resulting string is passed
    through w3lib's ``replace_entities`` with ``lt`` and ``amp`` given as
    ``keep``.
    """
    pattern = re.compile(regex, re.UNICODE) if isinstance(regex, str) else regex

    if "extract" not in pattern.groupindex:
        # Whole-match or numbered-group mode: findall() may return tuples
        # (one per match) when several groups exist; flattened below.
        found = pattern.findall(text)
    else:
        # Named-group mode: only the first match is looked at.
        match = pattern.search(text)
        captured = None if match is None else match.group("extract")
        found = [] if captured is None else [captured]

    flat = flatten(found)
    if replace_entities:
        return [w3lib_replace_entities(item, keep=["lt", "amp"]) for item in flat]
    return flat




[docs]
def shorten(text: str, width: int, suffix: str = "...") -> str:
    """Return *text* cut down so that it is at most *width* characters long.

    Text that already fits is returned unchanged. Otherwise the text is
    truncated and *suffix* is appended so the total length equals *width*.
    When *width* leaves no room for any text next to the suffix, the last
    *width* characters of *suffix* are returned instead.

    Raises ``ValueError`` if *width* is negative.
    """
    if width < 0:
        raise ValueError("width must be equal or greater than 0")
    if len(text) <= width:
        return text

    suffix_len = len(suffix)
    if width > suffix_len:
        kept = text[: width - suffix_len]
        return kept + suffix
    # Not enough room for text plus suffix: keep only the suffix tail.
    return suffix[suffix_len - width :]