Source code for harvester.edeposit_autoparser

#! /usr/bin/env python
# -*- coding: utf-8 -*-
#
# Interpreter version: python 2.7
#
# Imports =====================================================================
import sys
import os.path
import argparse

import dhtmlparser

import autoparser.utils as utils
import autoparser.conf_reader as conf_reader
import autoparser.vectors as vectors
import autoparser.path_patterns as path_patterns
import autoparser.generator as generator
from autoparser.path_patterns import PathCall, Chained


# Functions & objects =========================================================
def _create_dom(data):
    """
    Build a double-linked DOM out of `data`.

    Args:
        data (str/HTMLElement): Raw HTML string, or an already parsed
             HTMLElement.

    Returns:
        obj: HTMLElement that was passed to ``dhtmlparser.makeDoubleLinked``.
    """
    if isinstance(data, dhtmlparser.HTMLElement):
        # already parsed - use the element as it is
        dom = data
    else:
        # raw input goes through the encoding helper before it is parsed
        dom = dhtmlparser.parseString(utils.handle_encodnig(data))

    dhtmlparser.makeDoubleLinked(dom)

    return dom


def _locate_element(dom, el_content, transformer=None):
    """
    Search `dom` for elements whose content corresponds to `el_content`.

    The comparison itself is delegated to the predicate built by
    ``utils.content_matchs(el_content, transformer)``; `transformer` is handed
    over to that helper unchanged.

    Args:
        dom (obj): HTMLElement tree.
        el_content (str): Content of the element which should be picked from
                   `dom`.
        transformer (fn, default None): Transforming function.

    Note:
        `transformer` parameter can be for example simple lambda::

            lambda x: x.strip()

    Returns:
        list: Matching HTMLElements.
    """
    search = dom.find
    predicate = utils.content_matchs(el_content, transformer)

    # no tag name restriction (None), only the content predicate decides
    return search(None, fn=predicate)


def _match_elements(dom, matches):
    """
    Find location of elements matching patterns specified in `matches`.

    Args:
        dom (obj): HTMLElement DOM tree.
        matches (dict): Structure: ``{"var": {"data": "match", ..}, ..}``.
                Optional keys read from each inner dict are ``"notfoundmsg"``
                and ``"tagname"``.

    Note:
        The inner dictionaries of `matches` are modified in place: the
        ``"notfoundmsg"`` key is always (re)written with the final message
        (``$name`` replaced by the variable name, or a default message when
        none was given).

    Returns:
        dict: Structure: ``{"var": HTMLElement_obj, ..}`` - exactly one \
              element for each variable.

    Raises:
        UserWarning: When no element, or more than one element, matches the
                     pattern of some variable.
    """
    out = {}
    for key, content in matches.items():
        pattern = content["data"].strip()
        if "\n" in pattern:
            # multi-line patterns are compared as lists of whitespace
            # separated tokens, so differences in whitespace don't matter
            pattern = pattern.split()
            transformer = lambda x: x.strip().split()
        else:
            transformer = lambda x: x.strip()

        matching_elements = _locate_element(
            dom,
            pattern,
            transformer=transformer
        )

        not_found_msg = content.get("notfoundmsg", "").replace("$name", key)
        if not not_found_msg.strip():
            not_found_msg = "Can't locate variable '%s' with content '%s'!" % (
                key,
                pattern,
            )
        content["notfoundmsg"] = not_found_msg

        # in case of multiple elements, keep only elements with proper tagname
        tagname = content.get("tagname", "").strip().lower()
        if tagname:
            # a real list is required - len() and indexing are used below
            matching_elements = [
                el
                for el in matching_elements
                if el.getTagName().strip().lower() == tagname
            ]

        if not matching_elements:
            raise UserWarning(not_found_msg)

        if len(matching_elements) > 1:
            raise UserWarning(
                "Ambigious content '%s'! " % (content,)
                + "Content was found in multiple elements!"
            )

        out[key] = matching_elements[0]

    return out


def _collect_paths(element):
    """
    Collect all possible path which leads to `element`.

    Function returns standard path from root element to this, reverse path,
    which uses negative indexes for path, also some pattern matches, like
    "this is element, which has neighbour with id 7" and so on.

    Args:
        element (obj): HTMLElement instance.

    Returns:
        list: List of :class:`.PathCall` and :class:`.Chained` objects.
    """
    output = []

    # look for element by parameters - sometimes the ID is unique
    path = vectors.el_to_path_vector(element)
    root = path[0]
    params = element.params if element.params else None
    match = root.find(element.getTagName(), params)

    # the direct lookup is usable only when it is unambiguous
    if len(match) == 1:
        output.append(
            PathCall("find", 0, [element.getTagName(), params])
        )

    # look for element by neighbours
    output.extend(path_patterns.neighbours_pattern(element))

    # look for elements by patterns - element, which parent has tagname, and
    # which parent has tagname ..
    output.extend(path_patterns.predecesors_pattern(element, root))

    index_backtrack = []        # tag name only, index counted from the start
    last_index_backtrack = []   # tag name only, negative index from the end
    params_backtrack = []       # tag name + params, index from the start
    last_params_backtrack = []  # tag name + params, negative index

    # look for element by paths from root to element; the steps are collected
    # from the element upwards and put into root-first order at the end
    for el in reversed(path):
        # skip root elements
        if not el.parent:
            continue

        tag_name = el.getTagName()
        match = el.parent.wfind(tag_name).childs
        index = match.index(el)

        index_backtrack.append(
            PathCall("wfind", index, [tag_name])
        )
        last_index_backtrack.append(
            PathCall("wfind", index - len(match), [tag_name])
        )

        # if element has some parameters, use them for lookup
        if el.params:
            match = el.parent.wfind(tag_name, el.params).childs
            index = match.index(el)

            params_backtrack.append(
                PathCall("wfind", index, [tag_name, el.params])
            )
            last_params_backtrack.append(
                PathCall("wfind", index - len(match), [tag_name, el.params])
            )
        else:
            params_backtrack.append(
                PathCall("wfind", index, [tag_name])
            )
            last_params_backtrack.append(
                PathCall("wfind", index - len(match), [tag_name])
            )

    # reversed() returns a one-shot iterator; the chains are walked repeatedly
    # (once for every example when they are validated), so hand over real
    # lists, which can be iterated any number of times
    output.extend([
        Chained(list(reversed(params_backtrack))),
        Chained(list(reversed(last_params_backtrack))),
        Chained(list(reversed(index_backtrack))),
        Chained(list(reversed(last_index_backtrack))),
    ])

    return output


def _is_working_path(dom, path, element):
    """
    Check whether the path is working or not.

    Apply proper search function interpreting `path` to `dom` and compare the
    stripped content of the located object with the stripped content of
    `element`. If they are equal, return ``True``, otherwise ``False``.

    Args:
        dom (obj): HTMLElement DOM.
        path (obj): :class:`.PathCall` or :class:`.Chained` instance
                    containing informations about path and which function it
                    require to obtain element the path is pointing to.
        element (obj): HTMLElement instance used to decide whether `path`
                       points to correct `element` or not.

    Returns:
        bool: True if `path` correctly points to proper `element`.

    Raises:
        UserWarning: When `path` is neither :class:`.PathCall` nor
                     :class:`.Chained`.
    """
    def i_or_none(el, i):
        """
        Return ``el[i]`` if such item exists, or None otherwise.

        Args:
            el (list, tuple): Any indexable object.
            i (int): Index.

        Returns:
            obj: Element at index `i`, or ``None`` if `el` is blank or `i` \
                 is out of its range.
        """
        if not el:
            return None

        # index was recorded on different DOM, so it may not exist here - in
        # that case the path just doesn't work
        try:
            return el[i]
        except IndexError:
            return None

    # map decoders of all paths to one dictionary to make easier to call them
    path_functions = {
        "find": lambda el, index, params:
            i_or_none(el.find(*params), index),
        "wfind": lambda el, index, params:
            i_or_none(el.wfind(*params).childs, index),
        "match": lambda el, index, params:
            i_or_none(el.match(*params), index),
        "left_neighbour_tag": lambda el, index, neigh_data:
            i_or_none(
                el.find(
                    neigh_data.tag_name,
                    neigh_data.params,
                    fn=utils.has_neigh(*neigh_data.fn_params, left=True)
                ),
                index
            ),
        "right_neighbour_tag": lambda el, index, neigh_data:
            i_or_none(
                el.find(
                    neigh_data.tag_name,
                    neigh_data.params,
                    fn=utils.has_neigh(*neigh_data.fn_params, left=False)
                ),
                index
            ),
    }

    # call all decoders and see what you get from them
    el = None
    if isinstance(path, PathCall):
        el = path_functions[path.call_type](dom, path.index, path.params)
    elif isinstance(path, Chained):
        # every step of the chain is applied to the result of previous step
        for step in path.chain:
            dom = path_functions[step.call_type](dom, step.index, step.params)
            if not dom:
                return False
        el = dom
    else:
        raise UserWarning(
            "Unknown type of path parameters! (%s)" % str(path)
        )

    if not el:
        return False

    # test whether returned item is the item we are looking for
    return el.getContent().strip() == element.getContent().strip()


def select_best_paths(examples):
    """
    Process `examples`, select only paths that works for every example. Select
    best paths with highest priority.

    Args:
        examples (dict): Output from :func:`.read_config`. It is iterated and
                 each item is read using ``"html"`` and ``"vars"`` keys.

    Returns:
        dict: Structure ``{varname: [paths]}``, where paths are \
              :class:`.PathCall` and :class:`.Chained` objects sorted by \
              priority (best first).
    """
    possible_paths = {}  # {varname: [paths]}

    # collect list of all possible paths to all existing variables
    for example in examples:
        dom = _create_dom(example["html"])
        matching_elements = _match_elements(dom, example["vars"])

        for key, match in matching_elements.items():
            if key not in possible_paths:  # TODO: merge paths together?
                possible_paths[key] = _collect_paths(match)

    # leave only paths, that works in all examples where they are required
    for example in examples:
        dom = _create_dom(example["html"])
        matching_elements = _match_elements(dom, example["vars"])

        for key in list(possible_paths):
            if key not in matching_elements:
                continue

            # evaluated right now, while `dom` and `element` belong to the
            # example which is being processed
            element = matching_elements[key]
            possible_paths[key] = [
                path
                for path in possible_paths[key]
                if _is_working_path(dom, path, element)
            ]

    # lower number means better path; position in the list is the priority
    priorities = {
        call_type: rank
        for rank, call_type in enumerate([
            "find",
            "left_neighbour_tag",
            "right_neighbour_tag",
            "wfind",
            "match",
            "Chained",
        ])
    }

    # sort all paths by priority table, unknown call types go last
    for key in list(possible_paths):
        possible_paths[key] = sorted(
            possible_paths[key],
            key=lambda x: priorities.get(x.call_type, 100)
        )

    return possible_paths


# Command line entry point: read the configuration, pick the best paths for
# every variable and print the generated parser source to stdout.
if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description="Autoparser - parser generator."
    )
    parser.add_argument(
        "-c",
        "--config",
        required=True,
        help="""YAML Configuration file used to specify paths to data and
                matches, which will be used to create generator."""
    )

    args = parser.parse_args()

    # fail early with readable message instead of traceback
    if not os.path.exists(args.config):
        sys.stderr.write("Can't open '%s'!\n" % args.config)
        sys.exit(1)

    config = conf_reader.read_config(args.config)

    if not config:
        sys.stderr.write("Configuration file '%s' is blank!\n" % args.config)
        sys.exit(1)

    paths = select_best_paths(config)

    # single-argument parenthesised form prints the same under python 2.7
    # and is also valid python 3 syntax
    print(generator.generate_parsers(config, paths))