#! /usr/bin/env python
# -*- coding: utf-8 -*-
#
# Interpreter version: python 2.7
#
"""
This module is used to download metadata from `grada.cz`.
"""
# Imports =====================================================================
import httpkie
import dhtmlparser
from utils import handle_encodnig, get_first_content, normalize_url
from utils import self_test_idiom
from ..structures import Author
from ..structures import Publication
# Variables ===================================================================
# Root of the publisher's website; handed to normalize_url() together with
# the hrefs found in the listing.
BASE_URL = "http://www.grada.cz"
# Page downloaded by get_publications().
# NOTE(review): the query string presumably asks for the first 100 items of
# the "novinky" (news) listing - confirm against the website.
URL = BASE_URL + "/novinky/?start=0&krok=100"
# Module-wide downloader instance, created at import time and used by
# get_publications().
DOWNER = httpkie.Downloader()
# Functions & objects =========================================================
def _parse_alt_title(html_chunk):
    """
    Parse title from alternative location if not found where it should be.

    The alternative location is the ``title`` attribute of the ``<input>``
    element whose ``src`` is ``../images_buttons/objednat_off.gif``. Only the
    text after the first colon is used.

    Args:
        html_chunk (obj): HTMLElement containing slice of the page with
            details.

    Returns:
        str: Book's title, stripped of surrounding whitespace.

    Raises:
        AssertionError: When the ``<input>`` element or its ``title``
            attribute can't be found.
    """
    buttons = html_chunk.find(
        "input",
        {"src": "../images_buttons/objednat_off.gif"}
    )

    # Raised explicitly (instead of using the ``assert`` statement), so the
    # checks are still performed when python runs with the -O switch.
    if not buttons:
        raise AssertionError("Can't find alternative title!")

    params = buttons[0].params
    if "title" not in params:
        raise AssertionError("Can't find alternative title source!")

    # title is stored as Bleh bleh: Title
    return params["title"].split(":", 1)[-1].strip()
def _parse_title_url(html_chunk):
    """
    Parse title/name of the book and URL of the book.

    The title is looked up in the ``<h2>`` inside ``<div class="comment">``.
    If the heading contains a link, the content of the link is used as the
    title and its ``href`` as the URL. When the ``<div>`` or the heading is
    missing, the title is taken from the alternative location and no URL is
    returned.

    Args:
        html_chunk (obj): HTMLElement containing slice of the page with
            details.

    Returns:
        tuple: (title, url). `url` is None when the title had to be parsed \
               from the alternative location; otherwise it is the result of \
               ``normalize_url()``.
    """
    comment = html_chunk.find("div", {"class": "comment"})
    if not comment:
        return _parse_alt_title(html_chunk), None

    heading = comment[0].find("h2")
    if not heading:
        return _parse_alt_title(html_chunk), None

    # look for the url of the book if present
    url = None
    link = heading[0].find("a")
    if link:
        url = link[0].params.get("href", None)
        heading = link  # the title is the content of the link

    return heading[0].getContent(), normalize_url(BASE_URL, url)
def _parse_subtitle(html_chunk):
    """
    Parse subtitle of the book.

    The subtitle is looked up at the path ``<div class="comment">`` ->
    ``<h2>`` -> ``<span class="gray">``.

    Args:
        html_chunk (obj): HTMLElement containing slice of the page with
            details.

    Returns:
        str/None: Subtitle or None if subtitle wasn't found.
    """
    subtitle_tags = html_chunk.match(
        ["div", {"class": "comment"}],
        "h2",
        ["span", {"class": "gray"}],
    )

    return get_first_content(subtitle_tags)
def _parse_authors(html_chunk):
    """
    Parse authors of the book.

    Authors are the links found at the path ``<div class="comment">`` ->
    ``<h3>`` -> ``<a>``. Content of the link is used as the name and its
    ``href`` as the URL of the author.

    Args:
        html_chunk (obj): HTMLElement containing slice of the page with
            details.

    Returns:
        list: List of :class:`structures.Author` objects. Blank if no author
              found.
    """
    links = html_chunk.match(
        ["div", {"class": "comment"}],
        "h3",
        "a",
    )

    if not links:
        return []

    # create Author objects
    authors = [
        Author(
            link.getContent().strip(),
            normalize_url(BASE_URL, link.params.get("href", None))
        )
        for link in links
    ]

    # Skip authors with blank name. Comprehension is used instead of
    # filter(), so a real list is returned under python 2 and 3 alike.
    return [author for author in authors if author.name.strip()]
def _parse_description(html_chunk):
    """
    Parse description of the book.

    The description is read from ``<div class="perex">``.

    Args:
        html_chunk (obj): HTMLElement containing slice of the page with
            details.

    Returns:
        str/None: Description as string or None if not found.
    """
    perex_tags = html_chunk.find("div", {"class": "perex"})

    return get_first_content(perex_tags)
def _parse_format_pages_isbn(html_chunk):
    """
    Parse format, number of pages and ISBN.

    All three values are read from the first non-blank line (lines are
    separated by ``<br />``) of ``<div class="price-overflow">``. ISBN is
    the content of the ``<b>`` tag, format and pages are the first two
    ``|`` separated items of that line.

    Args:
        html_chunk (obj): HTMLElement containing slice of the page with
            details.

    Returns:
        tuple: (format, pages, isbn), each as string or None if not found.
    """
    content = get_first_content(
        html_chunk.find("div", {"class": "price-overflow"})
    )

    if not content:
        return None, None, None

    # All information this function should parse are at one line. List
    # comprehension is used instead of filter(), which is not subscriptable
    # under python 3.
    lines = [line for line in content.split("<br />") if line.strip()]
    if not lines:
        # only blank lines (for example just "<br />") - nothing to parse
        return None, None, None

    first_line = lines[0]

    # parse isbn
    bold_tags = dhtmlparser.parseString(first_line).find("b")
    isbn = bold_tags[0].getContent() if bold_tags else None

    # parse pages and format
    pages = None
    book_format = None
    details = first_line.split("|")
    if len(details) >= 2:
        book_format = details[0].strip()
        pages = details[1].strip()

    return book_format, pages, isbn
def _parse_price(html_chunk):
    """
    Parse price of the book.

    The price is read from ``<div class="prices">``. Tags are removed from
    the content and only the text after the last newline is returned.

    Args:
        html_chunk (obj): HTMLElement containing slice of the page with
            details.

    Returns:
        str/None: Price as string with currency or None if not found.
    """
    content = get_first_content(
        html_chunk.find("div", {"class": "prices"})
    )

    if not content:
        return None

    # it is always in format Cena:\n150kč
    text = dhtmlparser.removeTags(content)

    return text.split("\n")[-1]
def _process_book(html_chunk):
    """
    Parse available informations about one book.

    Args:
        html_chunk (obj): HTMLElement containing slice of the page with
            details of one book (get_publications() passes one
            ``<div class="item">`` element).

    Returns:
        obj: :class:`structures.Publication` instance with book details.

    Raises:
        AssertionError: When the title can't be found (see \
                        :func:`_parse_alt_title`).
    """
    title, url = _parse_title_url(html_chunk)
    book_format, pages, isbn = _parse_format_pages_isbn(html_chunk)

    # required informations
    pub = Publication(
        title=title,
        authors=_parse_authors(html_chunk),
        price=_parse_price(html_chunk),
        publisher="Grada"
    )

    # optional informations
    pub.optionals.URL = url
    pub.optionals.ISBN = isbn
    pub.optionals.pages = pages
    pub.optionals.format = book_format
    pub.optionals.sub_title = _parse_subtitle(html_chunk)
    pub.optionals.description = _parse_description(html_chunk)

    return pub
def get_publications():
    """
    Get list of publication offered by grada.cz.

    Downloads the page at :attr:`URL`, parses it and processes each
    ``<div class="item">`` element found in it.

    Returns:
        list: List of :class:`.Publication` objects.
    """
    data = DOWNER.download(URL)
    dom = dhtmlparser.parseString(
        handle_encodnig(data)
    )

    # each book is stored in its own <div class="item">
    return [
        _process_book(book)
        for book in dom.find("div", {"class": "item"})
    ]
def self_test():
    """
    Perform basic selftest.

    The test is delegated to ``self_test_idiom()``, which gets
    :func:`get_publications` as its argument.

    Returns:
        Result of ``self_test_idiom()`` - expected to be True when \
        everything is ok. NOTE(review): confirm against \
        ``utils.self_test_idiom``.

    Raises:
        AssertionError: When there is some problem.
    """
    return self_test_idiom(get_publications)