Source code for harvester.scrappers.utils

#! /usr/bin/env python
# -*- coding: utf-8 -*-
#
# Interpreter version: python 2.7
#
"""
This module contains a number of helper functions which are used in the rest
of the scrappers submodule.
"""
# Imports =====================================================================
import dhtmlparser


# Functions & objects =========================================================
def _get_encoding(dom, default="utf-8"):
    """
    Try to look for meta tag in given `dom`.

    The encoding is read from the ``content`` parameter of the first
    ``<meta http-equiv="Content-Type">`` element, which typically looks like
    ``text/html; charset=utf-8``.

    Args:
        dom (obj): pyDHTMLParser dom of HTML elements.
        default (default "utf-8"): What to use if encoding is not found in
                                   `dom`.

    Returns:
        str/default: Lowercased encoding or `default` parameter if not found.
    """
    meta_tags = dom.find("meta", {"http-equiv": "Content-Type"})

    if not meta_tags:
        return default

    content = meta_tags[0].params.get("content", None)

    if not content:
        return default

    # Without any ``key=value`` pair (for example plain ``text/html``) there
    # is no charset to read - returning the whole string would later be used
    # as a codec name and fail.
    if "=" not in content:
        return default

    # Take everything after the last ``=`` and drop the whitespace, quotes
    # and trailing semicolon, which may surround the charset name.
    charset = content.lower().split("=")[-1].strip(" \t\r\n;\"'")

    return charset or default


def handle_encodnig(html):
    """
    Detect the encoding declared in `html` and convert `html` to utf-8.

    Only the part of the document before the first ``</head>`` is parsed when
    looking for the encoding declaration.

    Args:
        html (str): HTML code as string.

    Returns:
        str: Unchanged `html` if the detected encoding is ``utf-8``, \
             otherwise `html` decoded from detected encoding and encoded \
             to UTF-8.
    """
    # the meta tags live in the head, so there is no need to parse the body
    head_part = html.split("</head>")[0]
    head_dom = dhtmlparser.parseString(head_part)

    detected = _get_encoding(head_dom)

    if detected != "utf-8":
        return html.decode(detected).encode("utf-8")

    return html


def get_first_content(el_list, alt=None, strip=True):
    """
    Pick the content of the first element from `el_list`.

    `alt` is returned instead when `el_list` is empty, or when the (optionally
    stripped) content of the first element is blank.

    Args:
        el_list (list): List of HTMLElement objects.
        alt (default None): Value returned when list or content is blank.
        strip (bool, default True): Call .strip() on the content.

    Returns:
        str or alt: String representation of the content of the first element \
                    or `alt` if not found.
    """
    if not el_list:
        return alt

    text = el_list[0].getContent()
    text = text.strip() if strip else text

    return text if text else alt


def is_absolute_url(url, protocol="http"):
    """
    Test whether `url` is absolute url (``http://domain.tld/something``) or
    relative (``../something``).

    The scheme of the `url` is compared with `protocol` as a case-insensitive
    prefix, so the default ``"http"`` accepts both ``http://`` and
    ``https://`` URLs.

    Args:
        url (str): Tested string.
        protocol (str, default "http"): Protocol which will be sought at the
                 beginning of the `url`.

    Returns:
        bool: True if url is absolute, False if not.
    """
    if ":" not in url:
        return False

    # Keep the parsed scheme under its own name - storing it into `protocol`
    # would shadow the argument and the check below would compare the scheme
    # with itself, which is always true.
    scheme, rest = url.split(":", 1)

    return scheme.lower().startswith(protocol.lower()) and rest.startswith("//")


def normalize_url(base_url, rel_url):
    """
    Normalize the `rel_url` - from relative, create absolute URL.

    Absolute URLs (as decided by :func:`is_absolute_url`) are returned
    untouched. Relative ones are appended to `base_url`, with the two parts
    joined by exactly one slash.

    Args:
        base_url (str): Domain with ``protocol://`` string
        rel_url (str): Relative or absolute url.

    Returns:
        str/None: Normalized URL or None if `rel_url` is blank.
    """
    if not rel_url:
        return None

    if is_absolute_url(rel_url):
        return rel_url

    # `base_url` is documented as a bare domain, so parent references have
    # nowhere to climb to and are collapsed into the root
    path = rel_url.replace("../", "/")

    # Strip the slashes at the junction from both sides - otherwise base
    # ending with "/" joined with path starting with "/" (or "../../x", which
    # collapses to "//x") produces URL with doubled slash.
    return base_url.rstrip("/") + "/" + path.lstrip("/")


def has_param(param):
    """
    Build a predicate checking that html element has non-blank `param`.

    The returned function takes one element and returns True when the
    element's ``params`` contain `param` with value, which is not empty after
    stripping the whitespace. False is returned otherwise.

    This function can be used as parameter for .find() method in HTMLElement.
    """
    def has_param_closure(element):
        """
        Tell whether `element` has non-blank `param`.
        """
        value = element.params.get(param, "")

        return bool(value.strip())

    return has_param_closure


def must_contain(tag_name, tag_content, container_tag_name):
    """
    Build a predicate for elements containing given tags.

    The returned function accepts an element when the first `tag_name` tag
    matched in it has content equal to `tag_content` and - if
    `container_tag_name` is set - the element also matches a tag named
    `container_tag_name`.

    This function can be used as parameter for .find() method in HTMLElement.
    """
    def must_contain_closure(element):
        # the <tag_name> tag has to be there and its content has to be equal
        # to `tag_content`
        candidates = element.match(tag_name, absolute=True)
        if not candidates or candidates[0].getContent() != tag_content:
            return False

        # the container check is optional
        if not container_tag_name:
            return True

        return bool(element.match(container_tag_name, absolute=True))

    return must_contain_closure


def content_matchs(tag_content, content_transformer=None):
    """
    Build a predicate comparing the content of the tag with `tag_content`.

    Args:
        tag_content (str): Content of the tag which will be matched thru whole
                           DOM.
        content_transformer (fn, default None): Function applied to the
                            content of each tag before it is compared.

    Returns:
        fn: Function taking one element and returning True when the element \
            is a tag and its (transformed) content equals `tag_content`.

    This function can be used as parameter for .find() method in HTMLElement.
    """
    # fall back to identity, so the closure doesn't have to branch
    transform = content_transformer or (lambda text: text)

    def content_matchs_closure(element):
        if element.isTag():
            return tag_content == transform(element.getContent())

        return False

    return content_matchs_closure


def self_test_idiom(fn):
    """
    Perform basic selftest.

    Args:
        fn (fn): Function called without arguments, which returns the
                 sequence of scrapped books to check.

    Returns:
        True: When everything is ok.

    Raises:
        AssertionError: When there is some problem.
    """
    books = fn()

    assert len(books) > 0, "No books were returned!"

    for book in books:
        error = "Book doesn't have all required parameters!\n"
        error += str(book.to_namedtuple())

        assert book.title, error
        assert book.authors is not None, error  # can be blank
        assert book.price, error
        assert book.publisher, error

        isbn = book.optionals.ISBN
        if isbn:
            assert len(isbn) >= 10, "ISBN is too short: %r" % (isbn,)

        url = book.optionals.URL
        if url:
            # .partition() never fails, so URL without the colon is reported
            # by the asserts below, instead of ValueError from unpacking of
            # the .split() result
            protocol, _, rest = url.partition(":")

            assert protocol.startswith("http"), "Bad URL: %r" % (url,)
            assert rest.startswith("//"), "Bad URL: %r" % (url,)

    return True