Source code for harvester.autoparser.conf_reader

#! /usr/bin/env python
# -*- coding: utf-8 -*-
#
# Interpreter version: python 2.7
#
"""
Functions which allow reading serialized information for the autoparser.
"""
# Imports =====================================================================
import os.path
import copy

import yaml
import httpkie


# Functions & objects =========================================================
def _get_source(link):
    """
    Return source of the `link` whether it is filename or url.

    Args:
        link (str): Filename or URL. Links starting with ``http://`` or
            ``https://`` are downloaded using :mod:`httpkie`, anything else
            is treated as a path on the local filesystem.

    Returns:
        str: Content.

    Raises:
        UserWarning: When the `link` couldn't be resolved, i.e. it is neither
            a HTTP(S) URL nor a path to an existing non-directory file.
    """
    if link.startswith(("http://", "https://")):
        downloader = httpkie.Downloader()
        return downloader.download(link)

    # A directory "exists" too, but it can't be read as a file - let it fall
    # through to the documented UserWarning instead of failing in open().
    if os.path.exists(link) and not os.path.isdir(link):
        with open(link) as f:
            return f.read()

    raise UserWarning("html: '%s' is neither URL or data!" % link)


def _process_config_item(item, dirname):
    """
    Process one item from the configuration file, which contains multiple items
    saved as dictionary.

    This function reads additional data from the config and does some
    replacements - for example, if you specify url, it will download data
    from this url and so on.

    Args:
        item (dict): Item, which will be processed. The original object is
            not modified - the function works on a deep copy.
        dirname (str): Directory which is joined with the ``html`` value,
            when the value doesn't contain ``://``.

    Note:
        Returned data format::
            {
                "link": "link to html page/file",
                "html": "html code from file/url",
                "vars": {
                    "varname": {
                        "data": "matching data..",
                        ...
                    }
                }
            }

    Returns:
        dict: Dictionary in format showed above.

    Raises:
        UserWarning: When the `item` has no (or empty) ``html`` key, or when
            :func:`_get_source` can't resolve the resulting link.
    """
    item = copy.deepcopy(item)
    html = item.get("html", None)

    if not html:
        raise UserWarning("Can't find HTML source for item:\n%s" % str(item))

    # process HTML link - anything without scheme is a path relative to dirname
    link = html if "://" in html else os.path.join(dirname, html)
    del item["html"]  # everything what is left are the variable definitions

    # replace $name with the actual name of the field
    for key, val in item.items():
        # Only mappings can carry `notfoundmsg`. Without the type check, a
        # scalar/None value would raise TypeError (`in` on str is a substring
        # test and None is not iterable).
        if isinstance(val, dict) and "notfoundmsg" in val:
            val["notfoundmsg"] = val["notfoundmsg"].replace("$name", key)

    return {
        "html": _get_source(link),
        "link": link,
        "vars": item
    }


def read_config(file_name):
    """
    Read YAML file with configuration and pointers to example data.

    Args:
        file_name (str): Name of the file, where the configuration is stored.

    Returns:
        list: Parsed and processed data - one dictionary for each non-empty
            YAML document in the file (see :func:`_process_config_item`).

    Example YAML file::

        html: simple_xml.xml
        first:
            data: i wan't this
            required: true
            notfoundmsg: Can't find variable $name.
        second:
            data: and this
        ---
        html: simple_xml2.xml
        first:
            data: something wanted
            required: true
            notfoundmsg: Can't find variable $name.
        second:
            data: another wanted thing
    """
    # relative links in the config are resolved against the directory of the
    # config file, expressed relatively to the current working directory
    dirname = os.path.dirname(os.path.abspath(file_name))
    dirname = os.path.relpath(dirname)

    # create utf-8 strings, not unicode
    def custom_str_constructor(loader, node):
        return loader.construct_scalar(node).encode('utf-8')

    # NOTE: this registers the constructor globally in the yaml module, so it
    # also affects other users of the default loader(s) in the process.
    yaml.add_constructor(u'tag:yaml.org,2002:str', custom_str_constructor)

    with open(file_name) as f:
        content = f.read()

    # The Loader is given explicitly, because newer PyYAML releases refuse
    # to run load_all() without it; yaml.Loader is what older releases used
    # by default.
    # NOTE(security): yaml.Loader can construct arbitrary python objects - use
    # this function only with trusted configuration files.
    documents = yaml.load_all(content, Loader=yaml.Loader)

    # Empty documents (for example after trailing `---`) are loaded as None
    # and carry no configuration, so they are skipped.
    return [
        _process_config_item(item, dirname)
        for item in documents
        if item is not None
    ]