#! /usr/bin/env python
# -*- coding: utf-8 -*-
#
# Interpreter version: python 2.7
#
"""
This module is used to download the last 100 books published by `ben.cz`.
"""
# Imports =====================================================================
import httpkie
import dhtmlparser
from utils import self_test_idiom
from ..structures import Author
from ..structures import Publication
# Variables ===================================================================
#: Listing URL of the eshop: base address joined with the product query.
URL = (
    "http://shop.ben.cz/"
    r"Produkty.aspx?lang=cz&nak=BEN+-+technick%u00e1+literatura"
)

#: Downloader shared by every request made from this module.
DOWNER = httpkie.Downloader()

# Cookies registered for the shop.ben.cz domain.
# NOTE(review): presumably these make the shop list 100 items per page in
# table ("tabulka") layout - confirm against the eshop.
DOWNER.cookies = {
    "shop.ben.cz": {
        "pageSize": "100",
        "viewProductSize": "tabulka",
    },
}
# Functions & objects =========================================================
[docs]def _get_last_td(el):
"""
Return last <td> found in `el` DOM.
Args:
el (obj): :class:`dhtmlparser.HTMLElement` instance.
Returns:
obj: HTMLElement instance if found, or None if there are no <td> tags.
"""
if not el:
return None
if type(el) in [list, tuple, set]:
el = el[0]
last = el.find("td")
if not last:
return None
return last[-1]
def _get_td_or_none(details, ID):
    """
    Get <tr> tag with given `ID` and return content of the last <td> tag from
    <tr> root.

    Args:
        details (obj): :class:`dhtmlparser.HTMLElement` instance.
        ID (str): id property of the <tr> tag.

    Returns:
        str/None: Stripped content of the last <td> as string, or None when
            the cell is not found or its content is blank.
    """
    rows = details.find("tr", {"id": ID})
    cell = _get_last_td(rows)

    # nothing found - hand the "missing" marker over to the caller
    if not cell:
        return None

    # blank content is reported the same way as a missing cell
    return cell.getContent().strip() or None
# Parsers =====================================================================
[docs]def _parse_title(dom, details):
"""
Parse title/name of the book.
Args:
dom (obj): HTMLElement containing whole HTML page.
details (obj): HTMLElement containing slice of the page with details.
Returns:
str: Book's title.
Raises:
AssertionError: If title not found.
"""
title = details.find("h1")
# if the header is missing, try to parse title from the <title> tag
if not title:
title = dom.find("title")
assert title, "Can't find <title> tag!"
return title[0].getContent().split("|")[0].strip()
return title[0].getContent().strip()
def _parse_authors(details):
    """
    Parse authors of the book.

    Args:
        details (obj): HTMLElement containing slice of the page with details.

    Returns:
        list: List of :class:`structures.Author` objects. Blank if no author
            found.
    """
    author_rows = details.find(
        "tr",
        {"id": "ctl00_ContentPlaceHolder1_tblRowAutor"}
    )

    if not author_rows:
        return []  # book with unspecified authors

    # every <a> in the row is one author; the link content is passed to
    # Author() and the link target (when present) is stored as the URL
    author_list = []
    for link in author_rows[0].find("a"):
        author_obj = Author(link.getContent())

        if "href" in link.params:
            author_obj.URL = link.params["href"]

        author_list.append(author_obj)

    return author_list
def _parse_publisher(details):
    """
    Parse publisher of the book.

    Args:
        details (obj): HTMLElement containing slice of the page with details.

    Returns:
        str/None: Publisher's name as string or None if not found.
    """
    raw_publisher = _get_td_or_none(
        details,
        "ctl00_ContentPlaceHolder1_tblRowNakladatel"
    )

    # publisher is not specified
    if not raw_publisher:
        return None

    # strip the markup; return None instead of blank string
    return dhtmlparser.removeTags(raw_publisher).strip() or None
def _parse_price(details):
    """
    Parse price of the book.

    Args:
        details (obj): HTMLElement containing slice of the page with details.

    Returns:
        str/None: Content of the price row as string, or None if not found.
    """
    return _get_td_or_none(
        details,
        "ctl00_ContentPlaceHolder1_tblRowBeznaCena"
    )
def _parse_pages_binding(details):
    """
    Parse number of pages and binding of the book.

    Args:
        details (obj): HTMLElement containing slice of the page with details.

    Returns:
        (pages, binding): Tuple of two items, each a non-blank string or \
                          None when that part is missing.
    """
    pages = _get_td_or_none(
        details,
        "ctl00_ContentPlaceHolder1_tblRowRozsahVazba"
    )

    if not pages:
        return None, None

    # binding info and number of pages are stored in the same string,
    # separated by "/" (pages first)
    binding = None
    if "/" in pages:
        parts = pages.split("/")
        pages = parts[0].strip()
        binding = parts[1].strip()

    # blank parts are reported as None, for pages and binding alike
    return pages or None, binding or None
def _parse_ISBN_EAN(details):
    """
    Parse ISBN and EAN.

    Args:
        details (obj): HTMLElement containing slice of the page with details.

    Returns:
        (ISBN, EAN): Tuple of two items, each a non-blank string or None \
                     when that part is missing.
    """
    isbn_ean = _get_td_or_none(
        details,
        "ctl00_ContentPlaceHolder1_tblRowIsbnEan"
    )

    if not isbn_ean:
        return None, None

    # ISBN and EAN are stored in the same string as "ISBN / EAN". partition()
    # splits on the first "/" only, so a value with more slashes cannot break
    # the unpacking, and a value without any slash yields a blank EAN.
    isbn, _, ean = isbn_ean.partition("/")

    # blank parts are reported as None
    return isbn.strip() or None, ean.strip() or None
def _parse_edition(details):
    """
    Parse edition (vydání) of the book.

    Args:
        details (obj): HTMLElement containing slice of the page with details.

    Returns:
        str/None: Edition as string or None if not found.
    """
    return _get_td_or_none(
        details,
        "ctl00_ContentPlaceHolder1_tblRowVydani"
    )
def _parse_description(details):
    """
    Parse description of the book.

    Args:
        details (obj): HTMLElement containing slice of the page with details.

    Returns:
        str/None: Description as plain text (HTML tags removed) or None if
            not found or blank.
    """
    found = details.find("div", {"class": "detailPopis"})

    # description not found
    if not found:
        return None

    description = found[0]

    # Blank out the parts which are not description of the book itself:
    #   <div class="ekniha">  - links to ebook version
    #   <p class="detailKat"> - links to other books from same category
    for tag_name, css_class in (("div", "ekniha"), ("p", "detailKat")):
        unwanted = description.find(tag_name, {"class": css_class})
        if unwanted:
            unwanted[0].replaceWith(dhtmlparser.HTMLElement(""))

    # remove all HTML elements; blank description is reported as None
    return dhtmlparser.removeTags(description).strip() or None
def _process_book(book_url):
    """
    Parse available informations about book from the book details page.

    Args:
        book_url (str): Absolute URL of the book.

    Returns:
        obj: :class:`structures.Publication` instance with book details.

    Raises:
        AssertionError: If the page has no <div> with id `contentDetail`, or
            the title can't be found.
    """
    data = DOWNER.download(book_url)
    dom = dhtmlparser.parseString(data)

    details_tags = dom.find("div", {"id": "contentDetail"})
    assert details_tags, "Can't find details of the book."

    details = details_tags[0]

    # parse required informations
    title = _parse_title(dom, details)
    authors = _parse_authors(details)
    publisher = _parse_publisher(details)
    price = _parse_price(details)
    pages, binding = _parse_pages_binding(details)

    pub = Publication(
        title,
        authors,
        price,
        publisher
    )

    # parse optional informations
    pub.optionals.URL = book_url
    pub.optionals.binding = binding
    pub.optionals.pages = pages
    pub.optionals.ISBN, pub.optionals.EAN = _parse_ISBN_EAN(details)
    pub.optionals.edition = _parse_edition(details)
    pub.optionals.description = _parse_description(details)

    return pub
def get_publications():
    """
    Get list of publication offered by ben.cz.

    Returns:
        list: List of :class:`structures.Publication` objects.

    Raises:
        AssertionError: If the listing contains no <div> with class
            `seznamKniha`, or one of them has no link to the book details.
    """
    data = DOWNER.download(URL)
    dom = dhtmlparser.parseString(data)

    book_list = dom.find("div", {"class": "seznamKniha"})
    assert book_list, "Can't find <div> with class 'seznamKniha'!"

    books = []
    for html_chunk in book_list:
        links = html_chunk.find("a")
        assert links, "Can't find link to the details of the book!"

        detail_link = links[0]

        # skip items flagged with the "ruzek pripravujeme" corner label
        # NOTE(review): presumably titles still in preparation - confirm
        if detail_link.find("span", {"class": "ruzek pripravujeme"}):
            continue

        books.append(
            _process_book(detail_link.params["href"])
        )

    return books
def self_test():
    """
    Perform basic selftest by passing :func:`get_publications` to
    `self_test_idiom` from the `utils` module.

    Returns:
        True: When everything is ok.

    Raises:
        AssertionError: When there is some problem.
    """
    return self_test_idiom(get_publications)