Source code for harvester.scrappers.ben_cz

#! /usr/bin/env python
# -*- coding: utf-8 -*-
#
# Interpreter version: python 2.7
#
"""
This module is used to download last 100 books published by `ben.cz`.
"""
# Imports =====================================================================
import httpkie
import dhtmlparser

from utils import self_test_idiom

from ..structures import Author
from ..structures import Publication


# Variables ===================================================================
#: Base url of the eshop joined with the product listing query.
URL = (
    "http://shop.ben.cz/"
    r"Produkty.aspx?lang=cz&nak=BEN+-+technick%u00e1+literatura"
)

# Shared downloader; the cookies below are sent to shop.ben.cz and ask for
# 100 items per page (presumably rendered as a table - "tabulka").
DOWNER = httpkie.Downloader()
DOWNER.cookies = {
    "shop.ben.cz": dict(pageSize="100", viewProductSize="tabulka"),
}


# Functions & objects =========================================================
[docs]def _get_last_td(el): """ Return last <td> found in `el` DOM. Args: el (obj): :class:`dhtmlparser.HTMLElement` instance. Returns: obj: HTMLElement instance if found, or None if there are no <td> tags. """ if not el: return None if type(el) in [list, tuple, set]: el = el[0] last = el.find("td") if not last: return None return last[-1]
def _get_td_or_none(details, ID):
    """
    Look up the <tr> tag with given `ID` and return the content of its last
    <td> tag.

    Args:
        details (obj): :class:`dhtmlparser.HTMLElement` instance.
        ID (str): id property of the <tr> tag.

    Returns:
        str/None: Stripped content of the last <td> as string, or None when \
                  the row / cell is missing or the content is blank.
    """
    row = details.find("tr", {"id": ID})
    cell = _get_last_td(row)

    # no matching row or no <td> in it
    if not cell:
        return None

    # blank content is reported as None, not as empty string
    text = cell.getContent().strip()
    return text or None


# Parsers =====================================================================
[docs]def _parse_title(dom, details): """ Parse title/name of the book. Args: dom (obj): HTMLElement containing whole HTML page. details (obj): HTMLElement containing slice of the page with details. Returns: str: Book's title. Raises: AssertionError: If title not found. """ title = details.find("h1") # if the header is missing, try to parse title from the <title> tag if not title: title = dom.find("title") assert title, "Can't find <title> tag!" return title[0].getContent().split("|")[0].strip() return title[0].getContent().strip()
[docs]def _parse_authors(details): """ Parse authors of the book. Args: details (obj): HTMLElement containing slice of the page with details. Returns: list: List of :class:`structures.Author` objects. Blank if no author \ found. """ authors = details.find( "tr", {"id": "ctl00_ContentPlaceHolder1_tblRowAutor"} ) if not authors: return [] # book with unspecified authors # parse authors from HTML and convert them to Author objects author_list = [] for author in authors[0].find("a"): author_obj = Author(author.getContent()) if "href" in author.params: author_obj.URL = author.params["href"] author_list.append(author_obj) return author_list
def _parse_publisher(details):
    """
    Parse publisher of the book.

    Args:
        details (obj): HTMLElement containing slice of the page with details.

    Returns:
        str/None: Publisher's name as string with HTML tags removed, or None \
                  if not found or blank.
    """
    row_id = "ctl00_ContentPlaceHolder1_tblRowNakladatel"
    raw = _get_td_or_none(details, row_id)

    # publisher is not specified
    if not raw:
        return None

    # the cell may contain markup - keep only text; blank result means None
    name = dhtmlparser.removeTags(raw).strip()
    return name or None
def _parse_price(details):
    """
    Parse price of the book.

    Args:
        details (obj): HTMLElement containing slice of the page with details.

    Returns:
        str/None: Content of the price row as string, or None if not found.
    """
    row_id = "ctl00_ContentPlaceHolder1_tblRowBeznaCena"

    return _get_td_or_none(details, row_id)
def _parse_pages_binding(details):
    """
    Parse number of pages and binding of the book.

    Both values share one table cell in form ``pages / binding``. Without the
    slash, whole cell is treated as number of pages.

    Args:
        details (obj): HTMLElement containing slice of the page with details.

    Returns:
        (pages, binding): Tuple of two items. Each is a string, or None when \
                          that part is missing or blank.
    """
    pages = _get_td_or_none(
        details,
        "ctl00_ContentPlaceHolder1_tblRowRozsahVazba"
    )

    if not pages:
        return None, None

    binding = None

    # binding info and number of pages is stored in same string
    if "/" in pages:
        parts = pages.split("/")
        pages = parts[0].strip()
        # blank binding is reported as None, same as blank pages
        binding = parts[1].strip() or None

    return pages or None, binding
def _parse_ISBN_EAN(details):
    """
    Parse ISBN and EAN.

    Both values share one table cell in form ``ISBN / EAN``. Without the
    slash, whole cell is treated as ISBN.

    Args:
        details (obj): HTMLElement containing slice of the page with details.

    Returns:
        (ISBN, EAN): Tuple of two items. Each is a string, or None when that \
                     part is missing or blank.
    """
    isbn_ean = _get_td_or_none(
        details,
        "ctl00_ContentPlaceHolder1_tblRowIsbnEan"
    )

    if not isbn_ean:
        return None, None

    # ISBN and EAN are stored in same string. partition() cuts only at the
    # first "/", so an unexpected extra slash can't break the unpacking, and
    # a missing slash just leaves the EAN part blank.
    isbn, _, ean = isbn_ean.partition("/")

    # blank parts are reported as None
    return isbn.strip() or None, ean.strip() or None
def _parse_edition(details):
    """
    Parse edition of the book.

    Args:
        details (obj): HTMLElement containing slice of the page with details.

    Returns:
        str/None: Edition as string, or None if not found.
    """
    row_id = "ctl00_ContentPlaceHolder1_tblRowVydani"

    return _get_td_or_none(details, row_id)
[docs]def _parse_description(details): """ Parse description of the book. Args: details (obj): HTMLElement containing slice of the page with details. Returns: str/None: Details as string with currency or None if not found. """ description = details.find("div", {"class": "detailPopis"}) # description not found if not description: return None # remove links to ebook version ekniha = description[0].find("div", {"class": "ekniha"}) if ekniha: ekniha[0].replaceWith(dhtmlparser.HTMLElement("")) # remove links to other books from same cathegory detail = description[0].find("p", {"class": "detailKat"}) if detail: detail[0].replaceWith(dhtmlparser.HTMLElement("")) # remove all HTML elements description = dhtmlparser.removeTags(description[0]).strip() # description is blank if not description: return None return description
def _process_book(book_url):
    """
    Parse available informations about book from the book details page.

    The page is downloaded using the module-level ``DOWNER``.

    Args:
        book_url (str): Absolute URL of the book.

    Returns:
        obj: :class:`structures.Publication` instance with book details.

    Raises:
        AssertionError: If the <div> with id ``contentDetail`` or the title \
                        of the book is not found.
    """
    data = DOWNER.download(book_url)
    dom = dhtmlparser.parseString(data)

    details_tags = dom.find("div", {"id": "contentDetail"})

    if not details_tags:
        # raised explicitly, because `assert` statements are removed when
        # python runs with -O and the check would silently disappear
        raise AssertionError("Can't find details of the book.")

    details = details_tags[0]

    # parse required informations
    title = _parse_title(dom, details)
    authors = _parse_authors(details)
    publisher = _parse_publisher(details)
    price = _parse_price(details)
    pages, binding = _parse_pages_binding(details)

    pub = Publication(
        title,
        authors,
        price,
        publisher
    )

    # parse optional informations
    pub.optionals.URL = book_url
    pub.optionals.binding = binding
    pub.optionals.pages = pages
    pub.optionals.ISBN, pub.optionals.EAN = _parse_ISBN_EAN(details)
    pub.optionals.edition = _parse_edition(details)
    pub.optionals.description = _parse_description(details)

    return pub
def get_publications():
    """
    Get list of publication offered by ben.cz.

    Downloads the listing at ``URL`` and then the details page of each
    listed book.

    Returns:
        list: List of :class:`structures.Publication` objects.

    Raises:
        AssertionError: If the listing contains no <div> with class \
                        ``seznamKniha``, or some of them has no link.
    """
    data = DOWNER.download(URL)
    dom = dhtmlparser.parseString(data)

    book_list = dom.find("div", {"class": "seznamKniha"})

    # Checks are raised explicitly, because `assert` statements are removed
    # when python runs with -O and the checks would silently disappear.
    if not book_list:
        raise AssertionError("Can't find <div> with class 'seznamKniha'!")

    books = []
    for html_chunk in book_list:
        links = html_chunk.find("a")

        if not links:
            raise AssertionError(
                "Can't find link to the details of the book!"
            )

        # skip items marked with the 'ruzek pripravujeme' <span> (presumably
        # books which are announced, but not published yet)
        if links[0].find("span", {"class": "ruzek pripravujeme"}):
            continue

        books.append(
            _process_book(links[0].params["href"])
        )

    return books
def self_test():
    """
    Perform basic selftest.

    The check itself is delegated to ``self_test_idiom``, which receives
    :func:`get_publications` as the function to test.

    Returns:
        True: When everything is ok.

    Raises:
        AssertionError: When there is some problem.
    """
    outcome = self_test_idiom(get_publications)

    return outcome