Source code for harvester.autoparser.generator

#! /usr/bin/env python
# -*- coding: utf-8 -*-
#
# Interpreter version: python 2.7
#
"""
This module contains a number of template generators, which generate all the
Python code for the parser.
"""
# Imports =====================================================================
import inspect

import utils
import conf_reader
import path_patterns


# Variables ===================================================================
IND = "    "  #: Indentation.


# Functions & objects =========================================================
def _index_idiom(el_name, index, alt=None):
    """
    Generate code which replaces the container `el_name` with its item at
    `index`, or with `alt` when the container does not have enough items.

    Args:
        el_name (str): Name of the container variable in the generated code.
        index (int): Index of the item you want to obtain from container.
        alt (whatever, default None): Alternative value. Its ``repr()`` is
            embedded into the generated code.

    Returns:
        str: Python code (comment line, assignment and a blank line), indented
        by one level.

    Live example::

        >>> import generator as g
        >>> print g._index_idiom("xex", 0)
            # pick element from list
            xex = xex[0] if xex else None

        >>> print g._index_idiom("xex", 1, "something")
            # pick element from list
            xex = xex[1] if len(xex) - 1 >= 1 else 'something'

    """
    # For the first item a plain truthiness test of the container is enough,
    # any other index needs an explicit length check.
    if index == 0:
        guard = "%s" % el_name
    else:
        guard = "len(%s) - 1 >= %d" % (el_name, index)

    picked_item = "%s[%d]" % (el_name, index)

    comment_line = IND + "# pick element from list\n"
    assignment = IND + "%s = %s if %s else %s\n\n" % (
        el_name,
        picked_item,
        guard,
        repr(alt),
    )

    return comment_line + assignment


def _required_idiom(tag_name, index, notfoundmsg):
    """
    Generate code which makes sure that ``el`` has enough items and then
    picks the item at `index`.

    The generated code raises :class:`UserWarning` with debug data when the
    item is not present.

    Args:
        tag_name (str): Name of the tag, used only in the error message.
        index (int): Index of the item you want to obtain from container.
        notfoundmsg (str): First line of the :class:`UserWarning` message.
                           ``None`` is treated as an empty message.

    Returns:
        str: Python code, indented by one level.
    """
    # Callers default `notfoundmsg` to None; treat that as "no message"
    # instead of failing on None.strip().
    message = (notfoundmsg or "").strip() + "\n"

    condition = "not el"
    if index > 0:
        condition += " or len(el) - 1 < %d" % index

    block_indent = IND * 2
    args_indent = IND * 3

    lines = [
        IND + "if %s:\n" % condition,
        block_indent + "raise UserWarning(\n",
        args_indent + "%s +\n" % repr(message),
        args_indent + repr("Tag name: " + str(tag_name)) + " + '\\n' +\n",
        args_indent + "'El:' + str(el) + '\\n' +\n",
        args_indent + "'Dom:' + str(dom)\n",
        block_indent + ")\n\n",
        IND + "el = el[%d]\n\n" % index,
    ]

    return "".join(lines)


# parser template generators ##################################################
def _find_template(parameters, index, required=False, notfoundmsg=None):
    """
    Generate ``.find()`` call for HTMLElement.

    Args:
        parameters (list): List of parameters for ``.find()``. Their
                   ``repr()`` is embedded into the generated call.
        index (int): Index of the item you want to get from ``.find()`` call.
        required (bool, default False): Use :func:`_required_idiom` on the
                 returned data instead of :func:`_index_idiom`.
        notfoundmsg (str, default None): Message which will be used for
                    :func:`_required_idiom` if the item is not found.

    Returns:
        str: Python code.

    Live example::

        >>> print g._find_template(["<xex>"], 3)
            el = dom.find('<xex>')

            # pick element from list
            el = el[3] if len(el) - 1 >= 3 else None

    """
    # repr() of the list without the surrounding brackets is a valid
    # argument list for the generated call
    arguments = repr(parameters)[1:-1]
    call = IND + "el = dom.find(%s)\n\n" % arguments

    if not required:
        return call + _index_idiom("el", index)

    return call + _required_idiom(parameters[0], index, notfoundmsg)


def _wfind_template(use_dom, parameters, index, required=False,
                    notfoundmsg=None):
    """
    Generate ``.wfind()`` call for HTMLElement.

    Args:
        use_dom (bool): Call ``.wfind()`` on ``dom``. If ``False``, ``el`` is
                used.
        parameters (list): List of parameters for ``.wfind()``. Their
                   ``repr()`` is embedded into the generated call.
        index (int): Index of the item you want to get from ``.wfind()`` call.
        required (bool, default False): Use :func:`_required_idiom` on the
                 returned data instead of :func:`_index_idiom`.
        notfoundmsg (str, default None): Message which will be used for
                    :func:`_required_idiom` if the item is not found.

    Returns:
        str: Python code.

    Live example::

        >>> print g._wfind_template(True, ["<xex>"], 3)
            el = dom.wfind('<xex>').childs

            # pick element from list
            el = el[3] if len(el) - 1 >= 3 else None

    """
    if use_dom:
        source_name = "dom"
    else:
        source_name = "el"

    # repr() of the list without the surrounding brackets is a valid
    # argument list for the generated call
    arguments = repr(parameters)[1:-1]
    call = IND + "el = %s.wfind(%s).childs\n\n" % (source_name, arguments)

    if not required:
        return call + _index_idiom("el", index)

    return call + _required_idiom(parameters[0], index, notfoundmsg)


def _match_template(parameters, index, required=False, notfoundmsg=None):
    """
    Generate ``.match()`` call for HTMLElement.

    Args:
        parameters (list): List of parameters for ``.match()``. Their
                   ``repr()`` is embedded into the generated call.
        index (int): Index of the item you want to get from ``.match()`` call.
        required (bool, default False): Use :func:`_required_idiom` on the
                 returned data instead of :func:`_index_idiom`.
        notfoundmsg (str, default None): Message which will be used for
                    :func:`_required_idiom` if the item is not found.

    Returns:
        str: Python code.

    Live example::

        >>> print g._match_template(["<xex>"], 3)
            el = dom.match('<xex>')

            # pick element from list
            el = el[3] if len(el) - 1 >= 3 else None

    """
    # repr() of the list without the surrounding brackets is a valid
    # argument list for the generated call
    arguments = repr(parameters)[1:-1]
    call = IND + "el = dom.match(%s)\n\n" % arguments

    # TODO: reduce None parameters

    if not required:
        return call + _index_idiom("el", index)

    return call + _required_idiom(parameters[0], index, notfoundmsg)


def _neigh_template(parameters, index, left=True, required=False,
                    notfoundmsg=None):
    """
    Generate neighbour matching call for HTMLElement, which returns only
    elements with required neighbours.

    The generated code is a ``dom.find()`` call with ``fn=has_neigh(...)``
    filter.

    Args:
        parameters (obj): Object with attributes ``tag_name``, ``params`` and
                   ``fn_params``. ``tag_name`` and (if not empty) ``params``
                   are passed to ``.find()``, ``fn_params`` are passed to
                   ``has_neigh()``.
        index (int): Index of the item you want to get from ``.find()`` call.
        left (bool, default True): Look for neighbour in the left side of el.
        required (bool, default False): Use :func:`_required_idiom` on the
                 returned data instead of :func:`_index_idiom`.
        notfoundmsg (str, default None): Message which will be used for
                    :func:`_required_idiom` if the item is not found.

    Returns:
        str: Python code.
    """
    arg_indent = IND + IND

    neigh_filter = "has_neigh(%s, left=%s)" % (
        repr(parameters.fn_params)[1:-1],
        repr(left),
    )

    lines = [
        IND + "el = dom.find(\n",
        arg_indent + "%s,\n" % repr(parameters.tag_name),
    ]

    # tag parameters are optional, skip them when empty
    if parameters.params:
        lines.append(arg_indent + "%s,\n" % repr(parameters.params))

    lines.append(arg_indent + "fn=%s\n" % neigh_filter)
    lines.append(IND + ")\n\n")

    call = "".join(lines)

    if not required:
        return call + _index_idiom("el", index)

    return call + _required_idiom(
        parameters.fn_params[0],
        index,
        notfoundmsg,
    )

# /parser template generators #################################################


[docs]def _get_parser_name(var_name):
    """
    Parser name composer.

    Args:
        var_name (str): Name of the variable.

    Returns:
        str: Parser function name.
    """
    return "get_%s" % var_name


def _generate_parser(name, path, required=False, notfoundmsg=None):
    """
    Generate parser named `name` for given `path`.

    Args:
        name (str): Basename for the parsing function (see
                    :func:`_get_parser_name` for details).
        path (obj): :class:`.PathCall` or :class:`.Chained` instance.
        required (bool, default False): Use :func:`_required_idiom` to returned
                 data.
        notfoundmsg (str, default None): Message which will be used for
                    :func:`_required_idiom` if the item is not found.

    Returns:
        str: Python code for parsing `path`.

    Raises:
        UserWarning: If `path` is neither :class:`.PathCall` nor
                     :class:`.Chained`.
        KeyError: If ``call_type`` of some call is not known.
    """
    # Every renderer takes the call description and `use_dom` flag. The flag
    # is used only by the ``wfind`` template - the first call of the parser
    # works with ``dom``, following calls in the chain with ``el``.
    renderers = {
        "find": lambda call, use_dom: _find_template(
            call.params, call.index, required, notfoundmsg
        ),
        "wfind": lambda call, use_dom: _wfind_template(
            use_dom, call.params, call.index, required, notfoundmsg
        ),
        "match": lambda call, use_dom: _match_template(
            call.params, call.index, required, notfoundmsg
        ),
        "left_neighbour_tag": lambda call, use_dom: _neigh_template(
            call.params, call.index, True, required, notfoundmsg
        ),
        "right_neighbour_tag": lambda call, use_dom: _neigh_template(
            call.params, call.index, False, required, notfoundmsg
        ),
    }

    output = "def %s(dom):\n" % _get_parser_name(name)

    # normalize both supported path types to a sequence of calls
    if isinstance(path, path_patterns.PathCall):
        calls = [path]
    elif isinstance(path, path_patterns.Chained):
        calls = path.chain
    else:
        raise UserWarning(
            "Unknown type of path parameters! (%s)" % str(path)
        )

    for position, call in enumerate(calls):
        output += renderers[call.call_type](call, position == 0)

    output += IND + "return el\n"
    output += "\n\n"

    return output


def _unittest_template(config):
    """
    Generate unittests for all of the generated code.

    Generated ``test_parsers()`` function downloads each link, parses it and
    asserts that each parser returns element with expected content.

    Args:
        config (list): Original configuration - sequence of dicts with keys
               ``link`` and ``vars``, where ``vars`` maps variable names to
               dicts with the expected content stored under the ``data`` key.
               See :mod:`~harvester.autoparser.conf_reader` for details.

    Returns:
        str: Python code.
    """
    chunks = ["def test_parsers():\n"]

    # later items with the same link override the earlier ones
    links = {item["link"]: item["vars"] for item in config}

    if not links:
        # keep the generated function syntactically valid even when there is
        # nothing to test
        chunks.append(IND + "pass\n")

    for link, variables in links.items():
        chunks.append(IND + "# Test parsers against %s\n" % link)
        chunks.append(IND + "html = handle_encodnig(\n")
        chunks.append(IND + IND + "_get_source(%s)\n" % repr(link))
        chunks.append(IND + ")\n")
        chunks.append(IND + "dom = dhtmlparser.parseString(html)\n")
        chunks.append(IND + "dhtmlparser.makeDoubleLinked(dom)\n\n")

        for var, info in variables.items():
            content = info["data"].strip()

            chunks.append(
                IND + "%s = %s(dom)\n" % (var, _get_parser_name(var))
            )

            if "\n" in content:
                # multiline content is compared word by word, so the
                # differences in whitespaces doesn't matter
                assertion = "assert %s.getContent().strip().split() == %s" % (
                    var,
                    repr(content.split()),
                )
            else:
                assertion = "assert %s.getContent().strip() == %s" % (
                    var,
                    repr(content),
                )

            chunks.append(IND + assertion)
            chunks.append("\n\n")

    chunks.append("\n")

    return "".join(chunks)


def generate_parsers(config, paths):
    """
    Generate parser for all `paths`.

    The output is source of a standalone module: file header, source code of
    the utility functions copied from :mod:`conf_reader` and :mod:`utils`,
    one parser per item in `paths`, unittest and the code which runs it.

    Args:
        config (list): Original configuration used to get matches for
                       unittests. The ``required`` and ``notfoundmsg``
                       options are read from ``vars`` of the first item. See
                       :mod:`~harvester.autoparser.conf_reader` for details.
        paths (dict): Output from :func:`.select_best_paths`. Maps variable
                      names to lists of paths; the first path is used.

    Returns:
        str: Python code containing all parsers for `paths`.
    """
    output = """#! /usr/bin/env python
# -*- coding: utf-8 -*-
#
# Interpreter version: python 2.7
#
# HTML parser generated by Autoparser
# (https://github.com/edeposit/edeposit.amqp.harvester)
#
import os
import os.path

import httpkie
import dhtmlparser


# Utilities
"""
    # copy source of the helpers used by the generated code, so the generated
    # module doesn't have to import conf_reader.py / utils.py
    helpers = (
        conf_reader._get_source,
        utils._get_encoding,
        utils.handle_encodnig,
        utils.is_equal_tag,
        utils.has_neigh,
    )
    for helper in helpers:
        output += inspect.getsource(helper) + "\n\n"

    output += "# Generated parsers\n"

    for name, candidates in paths.items():
        best_path = candidates[0]  # pick path with highest priority

        # options are taken from the first configuration item only
        options = config[0]["vars"][name]

        output += _generate_parser(
            name,
            best_path,
            options.get("required", False),
            options.get("notfoundmsg", ""),
        )

    output += "# Unittest\n"
    output += _unittest_template(config)

    output += "# Run tests of the parser\n"
    output += "if __name__ == '__main__':\n"
    output += IND + "test_parsers()"

    return output