Source code for harvester.autoparser.generator

#! /usr/bin/env python
# -*- coding: utf-8 -*-
#
# Interpreter version: python 2.7
#
"""
This module contains a number of template generators, which generate all the
Python code for the parser.
"""
# Imports =====================================================================
import inspect

import utils
import conf_reader
import path_patterns


# Variables ===================================================================
IND = "    "  #: One indentation level (four spaces) of the generated code.


# Functions & objects =========================================================
def _index_idiom(el_name, index, alt=None):
    """
    Generate a statement which replaces the container `el_name` with its
    `index`-th item, or with `alt` when the container is too short.

    Args:
        el_name (str): Name of the container variable in the generated code.
        index (int): Index of the item which should be picked.
        alt (whatever, default None): Fallback value; its ``repr()`` is put
            into the generated code.

    Returns:
        str: Python code (a comment line, the assignment and a blank line).

    Live example::

        >>> import generator as g
        >>> print g._index_idiom("xex", 0)
            # pick element from list
            xex = xex[0] if xex else None

        >>> print g._index_idiom("xex", 1, "something")
            # pick element from list
            xex = xex[1] if len(xex) - 1 >= 1 else 'something'
    """
    picked = "%s[%d]" % (el_name, index)

    # For the first item a plain truthiness test of the container is enough,
    # other positions need an explicit length check.
    guard = (
        "%s" % el_name
        if index == 0
        else "len(%s) - 1 >= %d" % (el_name, index)
    )

    comment_line = IND + "# pick element from list\n"
    assignment = IND + "%s = %s if %s else %s\n\n" % (
        el_name,
        picked,
        guard,
        repr(alt),
    )

    return comment_line + assignment
def _required_idiom(tag_name, index, notfoundmsg):
    """
    Generate code which makes sure that the container ``el`` has enough
    items and then picks the `index`-th one.

    The generated snippet raises :class:`.UserWarning` (with `notfoundmsg`,
    the tag name and a dump of ``el`` and ``dom``) when ``el`` is empty or
    shorter than required, otherwise it assigns ``el = el[index]``.

    Args:
        tag_name (str): Name of the looked up tag; used only in the debug
            part of the warning message.
        index (int): Index of the item you want to obtain from container.
        notfoundmsg (str): Message put at the beginning of the raised
            :class:`.UserWarning`. ``None`` is treated as an empty message.

    Returns:
        str: Python code.
    """
    # The template generators forward ``notfoundmsg=None`` by default, so
    # don't blow up on ``None.strip()`` - use an empty message instead.
    if notfoundmsg is None:
        notfoundmsg = ""
    message = notfoundmsg.strip() + "\n"

    # Index 0 is covered by the ``not el`` test, higher indexes need
    # an additional length check.
    size_check = ""
    if index > 0:
        size_check = " or len(el) - 1 < %d" % index

    block_indent = IND + IND
    arg_indent = IND + IND + IND

    check_lines = [
        IND + "if not el%s:" % size_check,
        block_indent + "raise UserWarning(",
        arg_indent + "%s +" % repr(message),
        arg_indent + repr("Tag name: " + str(tag_name)) + " + '\\n' +",
        arg_indent + "'El:' + str(el) + '\\n' +",
        arg_indent + "'Dom:' + str(dom)",
        block_indent + ")",
    ]

    return "\n".join(check_lines) + "\n\n" + IND + "el = el[%d]\n\n" % index


# parser template generators ##################################################
def _find_template(parameters, index, required=False, notfoundmsg=None):
    """
    Generate ``.find()`` call for HTMLElement.

    Args:
        parameters (list): List of parameters for ``.find()``.
        index (int): Index of the item you want to get from ``.find()`` call.
        required (bool, default False): Use :func:`_required_idiom` instead
            of :func:`_index_idiom` to pick the item from returned data.
        notfoundmsg (str, default None): Message which will be used for
            :func:`_required_idiom` if the item is not found.

    Returns:
        str: Python code.

    Live example::

        >>> print g._find_template(["<xex>"], 3)
            el = dom.find('<xex>')

            # pick element from list
            el = el[3] if len(el) - 1 >= 3 else None
    """
    # repr() of a list is "[...]" - without the brackets it is a ready-made
    # argument list for the generated call.
    call_args = repr(parameters)[1:-1]
    lookup = IND + "el = dom.find(%s)\n\n" % call_args

    if not required:
        return lookup + _index_idiom("el", index)

    return lookup + _required_idiom(parameters[0], index, notfoundmsg)
def _wfind_template(use_dom, parameters, index, required=False,
                    notfoundmsg=None):
    """
    Generate ``.wfind()`` call for HTMLElement.

    Args:
        use_dom (bool): Call ``.wfind()`` on ``dom``. If ``False``, ``el``
            is used.
        parameters (list): List of parameters for ``.wfind()``.
        index (int): Index of the item you want to get from ``.wfind()``
            call.
        required (bool, default False): Use :func:`_required_idiom` instead
            of :func:`_index_idiom` to pick the item from returned data.
        notfoundmsg (str, default None): Message which will be used for
            :func:`_required_idiom` if the item is not found.

    Returns:
        str: Python code.

    Live example::

        >>> print g._wfind_template(True, ["<xex>"], 3)
            el = dom.wfind('<xex>').childs

            # pick element from list
            el = el[3] if len(el) - 1 >= 3 else None
    """
    start = "dom" if use_dom else "el"

    # repr() of a list without its brackets is the argument list of the call
    call_args = repr(parameters)[1:-1]
    lookup = IND + "el = %s.wfind(%s).childs\n\n" % (start, call_args)

    if not required:
        return lookup + _index_idiom("el", index)

    return lookup + _required_idiom(parameters[0], index, notfoundmsg)
def _match_template(parameters, index, required=False, notfoundmsg=None):
    """
    Generate ``.match()`` call for HTMLElement.

    Args:
        parameters (list): List of parameters for ``.match()``.
        index (int): Index of the item you want to get from ``.match()``
            call.
        required (bool, default False): Use :func:`_required_idiom` instead
            of :func:`_index_idiom` to pick the item from returned data.
        notfoundmsg (str, default None): Message which will be used for
            :func:`_required_idiom` if the item is not found.

    Returns:
        str: Python code.

    Live example::

        >>> print g._match_template(["<xex>"], 3)
            el = dom.match('<xex>')

            # pick element from list
            el = el[3] if len(el) - 1 >= 3 else None
    """
    # repr() of a list without its brackets is the argument list of the call
    call_args = repr(parameters)[1:-1]
    lookup = IND + "el = dom.match(%s)\n\n" % call_args

    # TODO: reduce None parameters

    if not required:
        return lookup + _index_idiom("el", index)

    return lookup + _required_idiom(parameters[0], index, notfoundmsg)
def _neigh_template(parameters, index, left=True, required=False,
                    notfoundmsg=None):
    """
    Generate neighbour matching call for HTMLElement, which returns only
    elements with required neighbours.

    The generated code calls ``dom.find()`` with ``has_neigh(...)`` passed
    as the ``fn`` filter.

    Args:
        parameters (obj): Object with ``tag_name``, ``params`` and
            ``fn_params`` attributes. ``tag_name`` and (non-empty)
            ``params`` are passed to ``.find()``, ``fn_params`` to
            ``has_neigh()``.
        index (int): Index of the item you want to get from the ``.find()``
            call.
        left (bool, default True): Look for neighbour in the left side of
            el.
        required (bool, default False): Use :func:`_required_idiom` instead
            of :func:`_index_idiom` to pick the item from returned data.
        notfoundmsg (str, default None): Message which will be used for
            :func:`_required_idiom` if the item is not found.

    Returns:
        str: Python code.
    """
    neighbour_filter = "has_neigh(%s, left=%s)" % (
        repr(parameters.fn_params)[1:-1],
        repr(left),
    )

    arg_indent = IND + IND
    call_parts = [
        IND + "el = dom.find(\n",
        arg_indent + "%s,\n" % repr(parameters.tag_name),
    ]

    # tag parameters are emitted only when there are some
    if parameters.params:
        call_parts.append(arg_indent + "%s,\n" % repr(parameters.params))

    call_parts.append(arg_indent + "fn=%s\n" % neighbour_filter)
    call_parts.append(IND + ")\n\n")
    lookup = "".join(call_parts)

    if not required:
        return lookup + _index_idiom("el", index)

    return lookup + _required_idiom(
        parameters.fn_params[0],
        index,
        notfoundmsg
    )


# /parser template generators #################################################
[docs]def _get_parser_name(var_name): """ Parser name composer. Args: var_name (str): Name of the variable. Returns: str: Parser function name. """ return "get_%s" % var_name
def _generate_parser(name, path, required=False, notfoundmsg=None):
    """
    Generate parser named `name` for given `path`.

    Args:
        name (str): Basename for the parsing function (see
            :func:`_get_parser_name` for details).
        path (obj): :class:`.PathCall` or :class:`.Chained` instance.
        required (bool, default False): Use :func:`_required_idiom` instead
            of :func:`_index_idiom` to pick the item from returned data.
        notfoundmsg (str, default None): Message which will be used for
            :func:`_required_idiom` if the item is not found.

    Returns:
        str: Python code for parsing `path`.

    Raises:
        UserWarning: If `path` is neither :class:`.PathCall` nor
            :class:`.Chained`.
    """
    header = "def %s(dom):\n" % _get_parser_name(name)

    # Renderers indexed by the ``call_type`` of the step. Every renderer
    # gets the step and a flag saying whether the lookup starts at ``dom``
    # (the flag matters only for ``wfind``).
    renderers = {
        "find": lambda step, from_dom: _find_template(
            step.params, step.index, required, notfoundmsg
        ),
        "wfind": lambda step, from_dom: _wfind_template(
            from_dom, step.params, step.index, required, notfoundmsg
        ),
        "match": lambda step, from_dom: _match_template(
            step.params, step.index, required, notfoundmsg
        ),
        "left_neighbour_tag": lambda step, from_dom: _neigh_template(
            step.params, step.index, True, required, notfoundmsg
        ),
        "right_neighbour_tag": lambda step, from_dom: _neigh_template(
            step.params, step.index, False, required, notfoundmsg
        ),
    }

    if isinstance(path, path_patterns.PathCall):
        steps = [path]
    elif isinstance(path, path_patterns.Chained):
        steps = path.chain
    else:
        raise UserWarning(
            "Unknown type of path parameters! (%s)" % str(path)
        )

    pieces = [header]
    for position, step in enumerate(steps):
        # only the first step starts at ``dom``, the rest continue from
        # the previously found ``el``
        pieces.append(renderers[step.call_type](step, position == 0))

    pieces.append(IND + "return el\n")
    pieces.append("\n\n")

    return "".join(pieces)
def _unittest_template(config):
    """
    Generate unittests for all of the generated code.

    The result is source of a ``test_parsers()`` function which, for every
    link in `config`, downloads and parses the page and asserts that each
    generated parser returns the expected ``data``.

    Args:
        config (list): Original configuration; an iterable of dicts with
            ``link`` and ``vars`` keys, where ``vars`` maps variable names
            to dicts containing ``data``. See
            :mod:`~harvester.autoparser.conf_reader` for details.

    Returns:
        str: Python code.
    """
    # when the same link is used multiple times, the last ``vars`` wins
    links = dict((item["link"], item["vars"]) for item in config)

    output = "def test_parsers():\n"

    if not links:
        # A function without body is a syntax error, which would make the
        # whole generated module unusable - emit at least a no-op.
        output += IND + "pass\n"

    for link, variables in links.items():
        output += IND + "# Test parsers against %s\n" % link
        output += IND + "html = handle_encodnig(\n"
        output += IND + IND + "_get_source(%s)\n" % repr(link)
        output += IND + ")\n"
        output += IND + "dom = dhtmlparser.parseString(html)\n"
        output += IND + "dhtmlparser.makeDoubleLinked(dom)\n\n"

        for var, var_conf in variables.items():
            content = var_conf["data"].strip()

            output += IND + "%s = %s(dom)\n" % (var, _get_parser_name(var))

            if "\n" in content:
                # multiline data are compared as whitespace-separated tokens
                output += IND
                output += "assert %s.getContent().strip().split() == %s" % (
                    var,
                    repr(content.split())
                )
            else:
                output += IND + "assert %s.getContent().strip() == %s" % (
                    var,
                    repr(content)
                )

            output += "\n\n"

    return output + "\n"
def generate_parsers(config, paths):
    """
    Generate parser for all `paths`.

    The result is source of a standalone module: file header with imports,
    sources of the helper functions copied from :mod:`conf_reader` and
    :mod:`utils`, one parser per item in `paths`, the unittest and
    a ``__main__`` section which runs it.

    Args:
        config (list): Original configuration used to get matches for
            unittests. ``required`` and ``notfoundmsg`` of each variable
            are read from the first item's ``vars``. See
            :mod:`~harvester.autoparser.conf_reader` for details.
        paths (dict): Output from :func:`.select_best_paths`; maps variable
            names to lists of paths.

    Returns:
        str: Python code containing all parsers for `paths`.
    """
    header = """#! /usr/bin/env python
# -*- coding: utf-8 -*-
#
# Interpreter version: python 2.7
#
# HTML parser generated by Autoparser
# (https://github.com/edeposit/edeposit.amqp.harvester)
#
import os
import os.path

import httpkie
import dhtmlparser


# Utilities
"""
    chunks = [header]

    # helper functions whose source is copied into the generated module,
    # because the generated parsers and the unittest call them
    embedded_helpers = (
        conf_reader._get_source,
        utils._get_encoding,
        utils.handle_encodnig,
        utils.is_equal_tag,
        utils.has_neigh,
    )
    for helper in embedded_helpers:
        chunks.append(inspect.getsource(helper) + "\n\n")

    chunks.append("# Generated parsers\n")
    for name, candidates in paths.items():
        best_path = candidates[0]  # pick path with highest priority
        var_conf = config[0]["vars"][name]

        chunks.append(
            _generate_parser(
                name,
                best_path,
                var_conf.get("required", False),
                var_conf.get("notfoundmsg", ""),
            )
        )

    chunks.append("# Unittest\n")
    chunks.append(_unittest_template(config))

    chunks.append("# Run tests of the parser\n")
    chunks.append("if __name__ == '__main__':\n")
    chunks.append(IND + "test_parsers()")

    return "".join(chunks)