# Source code for harvester.scrappers.cpress_cz

#! /usr/bin/env python
# -*- coding: utf-8 -*-
#
# Interpreter version: python 2.7
#
"""
This module is used to download metadata information from `cpress.cz`.
"""
# Imports =====================================================================
import httpkie
import dhtmlparser

from utils import handle_encodnig, get_first_content, normalize_url, has_param
from utils import must_contain, self_test_idiom

from ..structures import Author
from ..structures import Publication


# Variables ===================================================================
# Root of the publisher's web; it ends with a slash, so paths are appended
# to it without a leading one.
BASE_URL = "http://www.cpress.cz/"
# Listing page downloaded by get_publications(). BASE_URL already carries the
# trailing slash - prefixing the path with another one produced
# "http://www.cpress.cz//novinky/".
URL = BASE_URL + "novinky/"
# Downloader instance shared by every request made from this module.
DOWNER = httpkie.Downloader()


# Functions & objects =========================================================
def _parse_alt_title(html_chunk):
    """
    Parse title from alternative location if not found where it should be.

    The alternative source is the ``alt`` parameter of the first ``<img>``
    tag in `html_chunk` accepted by ``has_param("alt")``.

    Args:
        html_chunk (obj): HTMLElement containing slice of the page with details.

    Returns:
        str: Book's title, stripped of surrounding whitespace.

    Raises:
        UserWarning: When no suitable ``<img>`` tag is found.
    """
    images = html_chunk.find("img", fn=has_param("alt"))

    if not images:
        raise UserWarning("Can't find alternative title source!")

    return images[0].params["alt"].strip()


def _parse_alt_url(html_chunk):
    """
    Parse URL from alternative location if not found where it should be.

    Takes the first ``<a href=...>`` in `html_chunk` whose target doesn't
    start with ``autori/`` (such links are skipped - they appear to point to
    author pages, not to the book).

    Args:
        html_chunk (obj): HTMLElement containing slice of the page with details.

    Returns:
        str/None: Book's URL normalized against ``BASE_URL``, or None if no \
                  usable link was found.
    """
    links = html_chunk.find("a", fn=has_param("href"))

    # A list comprehension always yields a real list, so the emptiness test
    # and the indexing below don't depend on map()/filter() returning lists
    # (which is true only on python 2).
    urls = [
        link.params["href"]
        for link in links
        if not link.params["href"].startswith("autori/")
    ]

    if not urls:
        return None

    return normalize_url(BASE_URL, urls[0])


def _parse_title_url(html_chunk):
    """
    Parse title/name of the book and URL of the book.

    Both values are primarily read from the link inside the
    ``<div class="polozka_nazev">``. When that link is missing, both are taken
    from the alternative locations; when only its text is blank, just the
    title is.

    Args:
        html_chunk (obj): HTMLElement containing slice of the page with details.

    Returns:
        tuple: (title, url). Title is a string; url is a string, or None when \
               it had to be taken from the alternative location and wasn't \
               found there.

    Raises:
        UserWarning: When the title can't be found at either location.
    """
    title_tags = html_chunk.match(
        ["div", {"class": "polozka_nazev"}],
        ["a", None, has_param("href")]
    )

    if not title_tags:
        return _parse_alt_title(html_chunk), _parse_alt_url(html_chunk)

    title_tag = title_tags[0]
    url = normalize_url(BASE_URL, title_tag.params["href"])

    # Strip before testing, so whitespace-only link text also falls back to
    # the alternative title (which is stripped as well).
    title = title_tag.getContent().strip()
    if not title:
        title = _parse_alt_title(html_chunk)

    return title, url


[docs]def _parse_authors(html_chunk):
    """
    Parse authors of the book.

    Args:
        html_chunk (obj): HTMLElement containing slice of the page with details.

    Returns:
        list: List of :class:`structures.Author` objects. Blank if no author \
              found.
    """
    authors_tags = html_chunk.match(
        ["div", {"class": "polozka_autor"}],
        "a"
    )

    authors = []
    for author_tag in authors_tags:
        # get name
        name = author_tag.getContent().strip()

        # skip tags without name
        if not name:
            continue

        # get url - if not found, set it to None
        url = author_tag.params.get("href", None)
        if url:
            url = normalize_url(BASE_URL, url)

        authors.append(
            Author(name, url)
        )

    return authors


[docs]def _parse_price(html_chunk):
    """
    Parse price of the book.

    Args:
        html_chunk (obj): HTMLElement containing slice of the page with details.

    Returns:
        str/None: Price as string with currency or None if not found.
    """
    price = html_chunk.find("span", {"class": "cena"})

    if not price:
        raise UserWarning("Price not found!")

    return get_first_content(price)


def _parse_from_table(html_chunk, what):
    """
    Go through table rows in `html_chunk` and try to locate content of the
    neighbor cell of the cell containing `what`.

    Rows are selected with ``must_contain("th", what, "td")``; the value is
    read from the ``<td>`` of the first selected row.

    Args:
        html_chunk (obj): HTMLElement containing slice of the page with details.
        what (str): Label identifying the wanted row, for example ``"EAN:"``.

    Returns:
        str: Table data or None.
    """
    rows = html_chunk.find("tr", fn=must_contain("th", what, "td"))

    if not rows:
        return None

    return get_first_content(rows[0].find("td"))


def _parse_ean(html_chunk):
    """
    Parse EAN.

    Looks up the table row labeled ``EAN:``.

    Args:
        html_chunk (obj): HTMLElement containing slice of the page with details.

    Returns:
        str/None: EAN as string or None if not found.
    """
    return _parse_from_table(html_chunk, "EAN:")


def _parse_date(html_chunk):
    """
    Parse date of publication.

    Looks up the table row carrying the Czech label for the date of
    publication.

    Args:
        html_chunk (obj): HTMLElement containing slice of the page with details.

    Returns:
        str/None: Date as string or None if not found.
    """
    return _parse_from_table(html_chunk, "Datum vydání:")


def _parse_format(html_chunk):
    """
    Parse format of the book.

    Looks up the table row carrying the Czech label for the format.

    Args:
        html_chunk (obj): HTMLElement containing slice of the page with details.

    Returns:
        str/None: Format as string or None if not found.
    """
    return _parse_from_table(html_chunk, "Formát:")


[docs]def _parse_description(html_chunk):
    """
    Parse description of the book.

    Args:
        html_chunk (obj): HTMLElement containing slice of the page with details.

    Returns:
        str/None: Description as string or None if not found.
    """
    description_tag = html_chunk.match(
        ["div", {"class": "kniha_detail_text"}],
        "p"
    )

    if not description_tag:
        return None

    description = get_first_content(description_tag)
    description = description.replace("<br />", "\n")
    description = description.replace("<br/>", "\n")

    return dhtmlparser.removeTags(description).strip()


def _process_book(html_chunk):
    """
    Parse available information about the book from the book details page.

    Title, URL and authors are read from `html_chunk`. The page at the book's
    URL is then downloaded, and price, EAN, format, publication date and
    description are read from its ``<div id="kniha_detail">``.

    Args:
        html_chunk (obj): HTMLElement containing slice of the page with details.

    Returns:
        obj: :class:`structures.Publication` instance with book details.

    Raises:
        UserWarning: When the title, URL of the details page, the details \
                     block or the price can't be found.
    """
    title, book_url = _parse_title_url(html_chunk)

    # The alternative URL lookup used by _parse_title_url() may return None;
    # fail with a clear message instead of handing None to the downloader.
    if not book_url:
        raise UserWarning("Can't find URL of the book details page!")

    # download page with details
    data = DOWNER.download(book_url)
    dom = dhtmlparser.parseString(
        handle_encodnig(data)
    )

    details_tags = dom.find("div", {"id": "kniha_detail"})
    if not details_tags:
        raise UserWarning(
            "Can't find book details at '%s'!" % book_url
        )
    details = details_tags[0]

    # required parameters
    pub = Publication(
        title=title,
        authors=_parse_authors(html_chunk),
        price=_parse_price(details),
        publisher="CPress"
    )

    # optional parameters
    pub.optionals.URL = book_url
    pub.optionals.EAN = _parse_ean(details)
    pub.optionals.format = _parse_format(details)
    pub.optionals.pub_date = _parse_date(details)
    pub.optionals.description = _parse_description(details)

    return pub


def get_publications():
    """
    Get list of publications offered by cpress.cz.

    Downloads the listing page at ``URL`` and processes every
    ``<div class="polozka">`` found in it with :func:`_process_book`, which
    downloads one more page per book.

    Returns:
        list: List of :class:`.Publication` objects.
    """
    data = DOWNER.download(URL)
    dom = dhtmlparser.parseString(
        handle_encodnig(data)
    )

    return [
        _process_book(book)
        for book in dom.find("div", {"class": "polozka"})
    ]


def self_test():
    """
    Perform basic selftest.

    Passes :func:`get_publications` to ``self_test_idiom()`` and returns its
    result.

    Returns:
        True: When everything is ok.

    Raises:
        AssertionError: When there is some problem.
    """
    return self_test_idiom(get_publications)