Source code for harvester.scrappers.grada_cz

#! /usr/bin/env python
# -*- coding: utf-8 -*-
#
# Interpreter version: python 2.7
#
"""
This module is used to download metadata from `grada.cz`.
"""
# Imports =====================================================================
import httpkie
import dhtmlparser

from utils import handle_encodnig, get_first_content, normalize_url
from utils import self_test_idiom

from ..structures import Author
from ..structures import Publication


# Variables ===================================================================
BASE_URL = "http://www.grada.cz"
URL = BASE_URL + "/novinky/?start=0&krok=100"
DOWNER = httpkie.Downloader()


# Functions & objects =========================================================
def _parse_alt_title(html_chunk):
    """
    Parse title from alternative location if not found where it should be.

    Args:
        html_chunk (obj): HTMLElement containing slice of the page with details.

    Returns:
        str: Book's title.
    """
    title = html_chunk.find(
        "input",
        {"src": "../images_buttons/objednat_off.gif"}
    )

    assert title, "Can't find alternative title!"
    title = title[0]

    assert "title" in title.params, "Can't find alternative title source!"

    # title is stored as "Bleh bleh: Title"
    title = title.params["title"].split(":", 1)[-1]

    return title.strip()


def _parse_title_url(html_chunk):
    """
    Parse title/name of the book and URL of the book.

    Args:
        html_chunk (obj): HTMLElement containing slice of the page with details.

    Returns:
        tuple: (title, url), both as strings.
    """
    title = html_chunk.find("div", {"class": "comment"})

    if not title:
        return _parse_alt_title(html_chunk), None

    title = title[0].find("h2")
    if not title:
        return _parse_alt_title(html_chunk), None

    # look for the URL of the book, if present
    url = None
    url_tag = title[0].find("a")
    if url_tag:
        url = url_tag[0].params.get("href", None)
        title = url_tag

    return title[0].getContent(), normalize_url(BASE_URL, url)
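
# Illustrative sketch (not part of the original module; the markup below is
# hypothetical): with a listing item shaped like this, _parse_title_url()
# returns the <h2> text as the title and the <a> href normalized against
# BASE_URL.
#
#     html = '<div class="comment"><h2><a href="/kniha/x/">Kniha</a></h2></div>'
#     chunk = dhtmlparser.parseString(html)
#     _parse_title_url(chunk)  # -> ('Kniha', normalized absolute URL)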


def _parse_subtitle(html_chunk):
    """
    Parse subtitle of the book.

    Args:
        html_chunk (obj): HTMLElement containing slice of the page with details.

    Returns:
        str/None: Subtitle or None if subtitle wasn't found.
    """
    subtitle = html_chunk.match(
        ["div", {"class": "comment"}],
        "h2",
        ["span", {"class": "gray"}],
    )

    return get_first_content(subtitle)


def _parse_authors(html_chunk):
    """
    Parse authors of the book.

    Args:
        html_chunk (obj): HTMLElement containing slice of the page with details.

    Returns:
        list: List of :class:`structures.Author` objects. Blank if no author \
              found.
    """
    authors = html_chunk.match(
        ["div", {"class": "comment"}],
        "h3",
        "a",
    )

    if not authors:
        return []

    authors = map(
        lambda x: Author(                           # create Author objects
            x.getContent().strip(),
            normalize_url(BASE_URL, x.params.get("href", None))
        ),
        authors
    )

    return filter(lambda x: x.name.strip(), authors)
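
# Illustrative sketch (hypothetical markup): _parse_authors() walks the path
# <div class="comment"> -> <h3> -> <a> and turns each matched link into an
# Author, using the link text as the name and the href (normalized against
# BASE_URL) as the author's URL.
#
#     html = '<div class="comment"><h3><a href="/autor/x/">Jan Novak</a></h3></div>'
#     chunk = dhtmlparser.parseString(html)
#     [a.name for a in _parse_authors(chunk)]  # -> ['Jan Novak']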


def _parse_description(html_chunk):
    """
    Parse description of the book.

    Args:
        html_chunk (obj): HTMLElement containing slice of the page with details.

    Returns:
        str/None: Description as string or None if not found.
    """
    perex = html_chunk.find("div", {"class": "perex"})

    return get_first_content(perex)


def _parse_format_pages_isbn(html_chunk):
    """
    Parse format, number of pages and ISBN.

    Args:
        html_chunk (obj): HTMLElement containing slice of the page with details.

    Returns:
        tuple: (format, pages, isbn), all as strings.
    """
    ppi = get_first_content(
        html_chunk.find("div", {"class": "price-overflow"})
    )

    if not ppi:
        return None, None, None

    # everything this function parses sits on the first non-empty
    # <br />-separated line
    ppi = filter(lambda x: x.strip(), ppi.split("<br />"))[0]

    # parse ISBN
    isbn = dhtmlparser.parseString(ppi)
    isbn = isbn.find("b")
    isbn = isbn[0].getContent() if isbn else None

    # parse pages and format
    pages = None
    book_format = None
    details = ppi.split("|")

    if len(details) >= 2:
        book_format = details[0].strip()
        pages = details[1].strip()

    return book_format, pages, isbn
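
# Illustrative sketch (hypothetical markup): the "price-overflow" block is
# expected to start with a line of the form "<format> | <pages> | ISBN: <b>...</b>",
# separated from the rest of the block by "<br />".
#
#     html = ('<div class="price-overflow">'
#             '17x24 cm | 256 stran | ISBN: <b>978-80-247-0000-0</b>'
#             '<br />Vydano: 2014</div>')
#     chunk = dhtmlparser.parseString(html)
#     _parse_format_pages_isbn(chunk)
#     # -> ('17x24 cm', '256 stran', '978-80-247-0000-0')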


def _parse_price(html_chunk):
    """
    Parse price of the book.

    Args:
        html_chunk (obj): HTMLElement containing slice of the page with details.

    Returns:
        str/None: Price as string with currency or None if not found.
    """
    price = get_first_content(
        html_chunk.find("div", {"class": "prices"})
    )

    if not price:
        return None

    # it is always in format "Cena:\n150kč"
    price = dhtmlparser.removeTags(price)
    price = price.split("\n")[-1]

    return price
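
# Illustrative sketch (hypothetical markup): the "prices" block carries the
# label and the amount separated by a newline; only the last line (the amount
# with currency) is returned.
#
#     html = '<div class="prices"><b>Cena:\n150kč</b></div>'
#     chunk = dhtmlparser.parseString(html)
#     _parse_price(chunk)  # -> '150kč'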


def _process_book(html_chunk):
    """
    Parse available information about the book from the book details page.

    Args:
        html_chunk (obj): HTMLElement containing slice of the page with details.

    Returns:
        obj: :class:`structures.Publication` instance with book details.
    """
    title, url = _parse_title_url(html_chunk)
    book_format, pages, isbn = _parse_format_pages_isbn(html_chunk)

    # required information
    pub = Publication(
        title=title,
        authors=_parse_authors(html_chunk),
        price=_parse_price(html_chunk),
        publisher="Grada"
    )

    # optional information
    pub.optionals.URL = url
    pub.optionals.ISBN = isbn
    pub.optionals.pages = pages
    pub.optionals.format = book_format
    pub.optionals.sub_title = _parse_subtitle(html_chunk)
    pub.optionals.description = _parse_description(html_chunk)

    return pub


def get_publications():
    """
    Get list of publications offered by grada.cz.

    Returns:
        list: List of :class:`.Publication` objects.
    """
    data = DOWNER.download(URL)
    dom = dhtmlparser.parseString(
        handle_encodnig(data)
    )

    book_list = dom.find("div", {"class": "item"})

    books = []
    for book in book_list:
        books.append(
            _process_book(book)
        )

    return books


def self_test():
    """
    Perform basic selftest.

    Returns:
        True: When everything is ok.

    Raises:
        AssertionError: When there is some problem.
    """
    return self_test_idiom(get_publications)
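

# Usage sketch (not part of the original module): downloads the current
# "novinky" listing and prints basic details of each harvested book. The
# attribute names follow the Publication fields filled in _process_book().
if __name__ == '__main__':
    for publication in get_publications():
        print publication.title, publication.optionals.URL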