Source code for harvester.scrappers.utils

#! /usr/bin/env python
# -*- coding: utf-8 -*-
#
# Interpreter version: python 2.7
#
"""
This module contains a number of functions which are used in the rest of the
scrappers submodule.
"""
# Imports =====================================================================
import dhtmlparser


# Functions & objects =========================================================
def _get_encoding(dom, default="utf-8"):
    """
    Try to read the charset from the ``Content-Type`` meta tag in given `dom`.

    Only the ``charset=...`` parameter of the tag's ``content`` attribute is
    considered. When the tag, the attribute or the charset parameter is
    missing (or the charset value is blank), `default` is returned instead
    of a string which is not an encoding name.

    Args:
        dom (obj): pyDHTMLParser dom of HTML elements.
        default (default "utf-8"): What to use if encoding is not found in
                                   `dom`.

    Returns:
        str/default: Lowercased encoding name or `default` parameter if not \
                     found.
    """
    meta_tags = dom.find("meta", {"http-equiv": "Content-Type"})

    if not meta_tags:
        return default

    content = meta_tags[0].params.get("content", None)

    if not content:
        return default

    # content looks like ``text/html; charset=utf-8`` - walk over the
    # ``;`` separated parameters and pick only the charset one
    for parameter in content.lower().split(";"):
        key, separator, value = parameter.partition("=")

        if not separator or key.strip() != "charset":
            continue

        # tolerate whitespace and quotes around the value
        value = value.strip().strip("'\"").strip()

        return value or default

    return default
def handle_encodnig(html):
    """
    Detect the encoding declared in `html` and re-encode the code to UTF-8.

    Only the part of the document before the first ``</head>`` is parsed
    when looking for the encoding declaration.

    Args:
        html (str): HTML code as string (presumably a python 2 byte string,
                    because it is decoded first - verify against callers).

    Returns:
        str: HTML code encoded in UTF-8. The input is returned untouched \
             when the detected encoding already is ``utf-8``.
    """
    head_part = html.split("</head>")[0]
    head_dom = dhtmlparser.parseString(head_part)
    detected = _get_encoding(head_dom)

    if detected != "utf-8":
        return html.decode(detected).encode("utf-8")

    return html
def get_first_content(el_list, alt=None, strip=True):
    """
    Pick the content of the first element from `el_list`.

    `alt` is used as fallback when the list is empty and also when the
    content of the first element turns out to be blank.

    Args:
        el_list (list): List of HTMLElement objects.
        alt (default None): Value returned when list or content is blank.
        strip (bool, default True): Call .strip() on the content.

    Returns:
        str or alt: String representation of the content of the first \
                    element or `alt` if not found.
    """
    if el_list:
        text = el_list[0].getContent()
        text = text.strip() if strip else text

        if text:
            return text

    return alt
def is_absolute_url(url, protocol="http"):
    """
    Test whether `url` is absolute url (``http://domain.tld/something``) or
    relative (``../something``).

    The scheme is compared case-insensitively and only as a prefix, so the
    default ``"http"`` accepts both ``http://`` and ``https://`` urls.

    Args:
        url (str): Tested string.
        protocol (str, default "http"): Protocol which will be sought at the
                 beginning of the `url`.

    Returns:
        bool: True if url is absolute, False if not.
    """
    if ":" not in url:
        return False

    # keep the parsed scheme in its own name - rebinding `protocol` here
    # would make the comparison below always succeed
    scheme, rest = url.split(":", 1)

    return scheme.lower().startswith(protocol.lower()) and rest.startswith("//")
def normalize_url(base_url, rel_url):
    """
    Normalize the `rel_url` - from relative, create absolute URL.

    Every ``../`` in a relative url is replaced with ``/`` and the result is
    joined to `base_url` with exactly one slash, no matter whether
    `base_url` ends with a slash or the relative part begins with one.
    Url which is already absolute is returned unchanged.

    Args:
        base_url (str): Domain with ``protocol://`` string
        rel_url (str): Relative or absolute url.

    Returns:
        str/None: Normalized URL or None if `rel_url` is blank.
    """
    if not rel_url:
        return None

    if is_absolute_url(rel_url):
        return rel_url

    # drop the slashes at the beginning, so the join below never produces
    # ``//`` at the boundary between the base and the path
    path = rel_url.replace("../", "/").lstrip("/")

    if base_url.endswith("/"):
        return base_url + path

    return base_url + "/" + path
def has_param(param):
    """
    Build a predicate checking that an html element has non-blank `param`.

    The returned function can be used as parameter for .find() method in
    HTMLElement.

    Args:
        param (str): Name of the parameter looked up in ``element.params``.

    Returns:
        fn: Function taking one element and returning bool.
    """
    def has_param_closure(element):
        """
        True when `param` of the `element` contains something else than
        whitespace.
        """
        value = element.params.get(param, "")

        return bool(value.strip())

    return has_param_closure
def must_contain(tag_name, tag_content, container_tag_name):
    """
    Build a predicate, which accepts only elements containing `tag_name`
    tag with string content `tag_content` and also another tag named
    `container_tag_name`.

    The check for `container_tag_name` is skipped when it is blank.

    The returned function can be used as parameter for .find() method in
    HTMLElement.
    """
    def must_contain_closure(element):
        # <tag_name> has to be matched by the element (the lookup is done
        # with ``absolute=True``) and the first match has to carry exactly
        # the `tag_content`
        found = element.match(tag_name, absolute=True)

        if found and found[0].getContent() == tag_content:
            # nothing more to verify without the container tag name
            if not container_tag_name:
                return True

            # <container_tag_name> has to be matched too
            return bool(element.match(container_tag_name, absolute=True))

        return False

    return must_contain_closure
def content_matchs(tag_content, content_transformer=None):
    """
    Build a predicate, which tells whether the content of a tag is equal to
    `tag_content`.

    Args:
        tag_content (str): Content of the tag which will be matched thru
                    whole DOM.
        content_transformer (fn, default None): Function applied to the
                            content of each tag before the comparison.

    The returned function can be used as parameter for .find() method in
    HTMLElement.
    """
    def content_matchs_closure(element):
        # elements which are not tags never match
        if element.isTag():
            text = element.getContent()

            if content_transformer:
                text = content_transformer(text)

            return tag_content == text

        return False

    return content_matchs_closure
def self_test_idiom(fn):
    """
    Run basic selftest over the books returned by `fn`.

    Args:
        fn (fn): Function called without parameters, which returns a
           sequence of book objects.

    Returns:
        True: When everything is ok.

    Raises:
        AssertionError: When there is some problem.
    """
    found = fn()

    assert len(found) > 0

    for item in found:
        message = "Book doesn't have all required parameters!\n" + \
                  str(item.to_namedtuple())

        assert item.title, message
        assert item.authors is not None, message  # can be blank
        assert item.price, message
        assert item.publisher, message

        # optional fields are verified only when they are filled in
        isbn = item.optionals.ISBN
        if isbn:
            assert len(isbn) >= 10

        url = item.optionals.URL
        if url:
            scheme, remainder = url.split(":", 1)

            assert scheme.startswith("http")
            assert remainder.startswith("//")

    return True