Source code for harvester.autoparser.path_patterns

#! /usr/bin/env python
# -*- coding: utf-8 -*-
#
# Interpreter version: python 2.7
#
"""
This module defines path-constructor functions and containers for data.

Containers are later used for validation of the paths in other examples and
by the generator, which creates the parser.
"""


# Functions & objects =========================================================
class NeighCall(object):
    """
    Record describing a neighbour lookup, as produced by
    :func:`_neighbour_to_path_call`.

    Attributes:
        tag_name (str): Name of the container for the data.
        params (dict): Parameters for the container.
        fn_params (list): Parameters for the function which will find the
            neighbour (see :func:`.has_neigh`).
    """
    def __init__(self, tag_name, params, fn_params):
        # Values are stored exactly as given - no copying, no validation.
        self.fn_params = fn_params
        self.params = params
        self.tag_name = tag_name
class PathCall(object):
    """
    Plain container for the data which will be used as parameters when
    calling search functions in `DOM`.

    Arguments:
        call_type (str): Determines type of the call to the HTMLElement
            method.
        index (int): Index of the item after `call_type` function is called.
        params (obj): Other parameters for the `call_type` function. Within
            this module a :class:`NeighCall` instance or a list of
            ``[tag_name, params]`` pairs is passed here.
    """
    def __init__(self, call_type, index, params):
        # Values are stored exactly as given - no copying, no validation.
        self.params = params
        self.index = index
        self.call_type = call_type
class Chained(object):
    """
    Container holding the parameters of chained calls.

    Arguments:
        chain (list): List of :class:`PathCall` classes. Any iterable is
            accepted; it is converted to a list when stored.
    """
    def __init__(self, chain):
        # Materialize the argument, because callers may hand over the result
        # of reversed() or of another iterator-returning function.
        self.chain = list(chain)

    @property
    def call_type(self):
        """
        Constant ``"Chained"``. The property exists to make sure that
        :class:`Chained` is interchangeable with :class:`PathCall`.
        """
        return "Chained"
def _params_or_none(params):
    """
    Return `params` unchanged when it is truthy, otherwise ``None``.

    Used to normalize empty parameter containers to ``None``.
    """
    if params:
        return params

    return None


def _neighbour_to_path_call(neig_type, neighbour, element):
    """
    Get :class:`PathCall` from `neighbour` and `element`.

    Args:
        neig_type (str): `left` for left neighbour, `right` for right one.
            This is used to determine :attr:`PathCall.call_type` of the
            returned object.
        neighbour (obj): Reference to `neighbour` object.
        element (obj): Reference to HTMLElement holding required data.

    Returns:
        obj: :class:`PathCall` instance with data necessary to find \
             `element` by comparing its `neighbour`.
    """
    content = neighbour.getContent().strip()

    if neighbour.isTag():
        # tag neighbours are described by name, parameters and content
        fn_params = [
            neighbour.getTagName(),
            _params_or_none(neighbour.params),
            content,
        ]
    else:
        # non-tag neighbours are described by their content only
        fn_params = [None, None, content]

    call_type = neig_type + "_neighbour_tag"
    neigh_call = NeighCall(
        element.getTagName(),
        _params_or_none(element.params),
        fn_params
    )

    # TODO: Dynamic lookup of the index (hard-coded 0 for now)
    return PathCall(call_type, 0, neigh_call)
def neighbours_pattern(element):
    """
    Look for neighbours of the `element`, return proper :class:`PathCall`.

    Args:
        element (obj): HTMLElement instance of the object you are looking
            for.

    Returns:
        list: List of :class:`PathCall` instances - one for the left and/or \
              one for the right neighbour. Empty list is returned when the \
              `element` has no parent or no visible neighbour.

    Raises:
        ValueError: If `element` is not found among the children of its \
                    parent.
    """
    # check if there are any neighbours
    if not element.parent:
        return []

    parent = element.parent

    # Keep only visible tags/neighbours (and always the `element` itself).
    # A list comprehension is used instead of filter(), because filter()
    # returns a lazy iterator under Python 3, which supports neither len()
    # nor .index(). Under Python 2.7 the result is the same list as before.
    neighbours = [
        child
        for child in parent.childs
        if (child.isTag() and not child.isEndTag())
        or child.getContent().strip()
        or child is element
    ]

    # the `element` alone has nobody to be compared with
    if len(neighbours) <= 1:
        return []

    output = []
    element_index = neighbours.index(element)

    # pick left neighbour
    if element_index >= 1:
        output.append(
            _neighbour_to_path_call(
                "left",
                neighbours[element_index - 1],
                element
            )
        )

    # pick right neighbour
    if element_index + 1 < len(neighbours):
        output.append(
            _neighbour_to_path_call(
                "right",
                neighbours[element_index + 1],
                element
            )
        )

    return output
def predecesors_pattern(element, root):
    """
    Look for `element` by its predecesors.

    Args:
        element (obj): HTMLElement instance of the object you are looking
            for.
        root (obj): Root of the `DOM`.

    Returns:
        list: ``[PathCall()]`` - list with one :class:`PathCall` object (to \
              allow use with ``.extend(predecesors_pattern())``). Empty \
              list is returned when the `element` has no parent or \
              grandparent, when the grandparent is the root container, or \
              when the `element` is not found by matching the trail.
    """
    parent = element.parent
    if not parent:
        return []

    grandparent = parent.parent
    if not grandparent:
        return []

    # grandparent with empty tag name is treated as the root container
    if grandparent.getTagName() == "":
        return []

    # [tag_name, params] pairs from the grandparent down to the `element`
    trail = [
        [grandparent.getTagName(), _params_or_none(grandparent.params)],
        [parent.getTagName(), _params_or_none(parent.params)],
        [element.getTagName(), _params_or_none(element.params)],
    ]

    match = root.match(*trail)

    # Always return a list - falling off the end of the function would
    # return None, which breaks ``.extend(predecesors_pattern(..))``.
    if element not in match:
        return []

    return [
        PathCall("match", match.index(element), trail)
    ]