Source code for harvester.filters.dup_filter

#! /usr/bin/env python
# -*- coding: utf-8 -*-
#
# Interpreter version: python 2.7
#
"""
This submodule is used to skip already parsed data.

Each `publication` parameter of the :func:`filter` is cached and if it is
called with same parameter again, None is retuned.

Note:
    Cache is using simple JSON serialization, so some form of cache persistency
    is granted. For path to the serialized data, look at
    :attr:`~harvester.settings.DUP_FILTER_FILE`.
"""
# Imports =====================================================================
import json
import os.path

from .. import settings


# Variables ===================================================================
_CACHE = None


# Functions & objects =========================================================
[docs]def save_cache(cache): """ Save cahce to the disk. Args: cache (set): Set with cached data. """ with open(settings.DUP_FILTER_FILE, "w") as f: f.write( json.dumps(list(cache)) )
[docs]def load_cache(): """ Load cache from the disk. Return: set: Deserialized data from disk. """ if not os.path.exists(settings.DUP_FILTER_FILE): return set() with open(settings.DUP_FILTER_FILE) as f: return set( json.loads(f.read()) )
[docs]def filter_publication(publication, cache=_CACHE): """ Deduplication function, which compares `publication` with samples stored in `cache`. If the match NOT is found, `publication` is returned, else None. Args: publication (obj): :class:`.Publication` instance. cache (obj): Cache which is used for lookups. Returns: obj/None: Depends whether the object is found in cache or not. """ if cache is None: cache = load_cache() if publication._get_hash() in cache: return None cache.update( [publication._get_hash()] ) save_cache(cache) return publication