Source code for harvester.autoparser.utils

#! /usr/bin/env python
# -*- coding: utf-8 -*-
#
# Interpreter version: python 2.7
#
"""
This module contains a number of functions which are used at multiple places
in autoparser.
"""
# Imports =====================================================================
import dhtmlparser


# Functions & objects =========================================================
def _get_encoding(dom, default="utf-8"):
    """
    Look for the encoding declared by ``Content-Type`` meta tag in given
    `dom`.

    Only the ``charset`` parameter of the meta tag's ``content`` attribute is
    taken into account (for example ``text/html; charset=utf-8``).

    Args:
        dom (obj): pyDHTMLParser dom of HTML elements.
        default (str, default "utf-8"): What to use if encoding is not found
                                        in `dom`.

    Returns:
        str/default: Lowercased encoding name or `default` parameter if the \
                     encoding is not declared.
    """
    meta_tags = dom.find("meta", {"http-equiv": "Content-Type"})

    if not meta_tags:
        return default

    content = meta_tags[0].params.get("content", None)

    if not content:
        return default

    # `content` is a list of `;` separated parts. Picking the `charset` part
    # explicitly prevents returning the mime type when no charset is declared
    # and ignores any parameters which follow the charset.
    for part in content.lower().split(";"):
        key, separator, value = part.partition("=")

        if not separator or key.strip() != "charset":
            continue

        # tolerate whitespace and quotes around the value
        value = value.strip().strip("\"'").strip()

        return value if value else default

    return default


def handle_encodnig(html):
    """
    Look for encoding in given `html`. Try to convert `html` to utf-8.

    Args:
        html (str): HTML code as (byte) string. The conversion uses
                    ``.decode()``, so the value has to provide this method.

    Returns:
        str: HTML code encoded in UTF-8. If the detected encoding already is \
             ``utf-8``, `html` is returned without any change.
    """
    # only the part before the `</head>` is parsed when looking for the
    # encoding declaration
    head = html.split("</head>")[0]
    encoding = _get_encoding(dhtmlparser.parseString(head))

    if encoding == "utf-8":
        return html

    return html.decode(encoding).encode("utf-8")
def content_matchs(tag_content, content_transformer=None):
    """
    Generate function, which checks whether the content of the tag matches
    `tag_content`.

    Args:
        tag_content (str): Content of the tag which will be matched through
                           whole DOM.
        content_transformer (fn, default None): Function applied to the
                            content of each tag before it is compared with
                            `tag_content`.

    Returns:
        fn: Function which takes one element and returns True if the element \
            is tag and its (transformed) content equals `tag_content`.

    Note:
        This function can be used as parameter for ``.find()`` method in
        HTMLElement.
    """
    def content_matchs_closure(element):
        # non-tag elements never match
        if not element.isTag():
            return False

        cont = element.getContent()
        if content_transformer:
            cont = content_transformer(cont)

        return tag_content == cont

    return content_matchs_closure
def is_equal_tag(element, tag_name, params, content):
    """
    Check whether the `element` object matches the rest of the parameters.

    Each check is performed only if the corresponding parameter is set:
    `tag_name` and `params` are skipped when they are empty, `content` is
    skipped only when it is ``None``.

    Args:
        element (obj): HTMLElement instance.
        tag_name (str): Tag name.
        params (dict): Parameters of the tag.
        content (str): Content of the tag. Compared with surrounding
                       whitespace stripped on both sides.

    Returns:
        bool: True if everything matches, False otherwise.
    """
    if tag_name and tag_name != element.getTagName():
        return False

    if params and not element.containsParamSubset(params):
        return False

    # empty string is valid content to look for, so test against None
    if content is not None \
       and content.strip() != element.getContent().strip():
        return False

    return True
def has_neigh(tag_name, params=None, content=None, left=True):
    """
    This function generates functions, which match all tags with neighbours
    defined by parameters.

    Args:
        tag_name (str): Tag has to have neighbour with this tagname.
        params (dict): Tag has to have neighbour with this parameters.
        content (str): Tag has to have neighbour with this content.
        left (bool, default True): Tag has to have neighbour on the left, or
                                   right (set to ``False``).

    Returns:
        fn: Function which takes one element and returns True if the element \
            has matching neighbour on the selected side.

    Note:
        This function can be used as parameter for ``.find()`` method in
        HTMLElement.
    """
    def has_neigh_closure(element):
        # only opening tags with parent can have neighbours
        if not element.parent \
           or not (element.isTag() and not element.isEndTag()):
            return False

        # filter only visible tags/neighbours; list is built explicitly,
        # because it is measured and indexed below
        childs = [
            child
            for child in element.parent.childs
            if (child.isTag() and not child.isEndTag())
            or child.getContent().strip()
            or child is element
        ]
        if len(childs) <= 1:
            return False

        ioe = childs.index(element)  # index of the element
        if left and ioe > 0:
            return is_equal_tag(childs[ioe - 1], tag_name, params, content)

        if not left and ioe + 1 < len(childs):
            return is_equal_tag(childs[ioe + 1], tag_name, params, content)

        # there is no neighbour on the requested side
        return False

    return has_neigh_closure