Source code for harvester.filters.aleph_filter

#! /usr/bin/env python
# -*- coding: utf-8 -*-
#
# Interpreter version: python 2.7
#
"""
This module is used to skip Publications that are already in Aleph.

Note:
    The module is using fuzzy lookup, see :func:`name_to_vector` and
    :func:`compare_names`.
"""
# Imports =====================================================================
import unicodedata

import edeposit.amqp.aleph as aleph


# Functions & objects =========================================================
def name_to_vector(name):
    """
    Convert `name` to the ASCII vector.

    The name is lowercased, stripped of diacritics, and every character
    which is not a letter or a space is dropped (not replaced by a space).
    The remaining words are sorted from the longest to the shortest; words
    of the same length keep their original order.

    Example:
        >>> name_to_vector("ing. Franta Putšálek")
        ['putsalek', 'franta', 'ing']

    Args:
        name (str): Name which will be vectorized. Byte strings are decoded
             as UTF-8.

    Returns:
        list: Vector created from name.
    """
    # `bytes` is an alias of `str` on Python 2, so this covers byte strings
    # under both interpreters without touching the py2-only `unicode` name
    if isinstance(name, (bytes, bytearray)):
        name = name.decode("utf-8")

    # NFKD splits accented letters into base letter + combining mark; the
    # ASCII encoder with 'ignore' then throws the combining marks away
    ascii_name = unicodedata.normalize("NFKD", name.lower())
    ascii_name = ascii_name.encode("ascii", "ignore")

    # Python 3 returns `bytes` from .encode(); convert back to native text so
    # the result is a list of `str` under both interpreters
    if not isinstance(ascii_name, str):
        ascii_name = ascii_name.decode("ascii")

    cleaned = "".join(
        char for char in ascii_name if char.isalpha() or char == " "
    )

    return sorted(cleaned.split(), key=len, reverse=True)


def compare_names(first, second):
    """
    Compare two names in a more complicated, but more error tolerant way.

    Algorithm is using vector comparison: both names are converted by
    :func:`name_to_vector`, only as many items as the shorter vector has are
    taken into account, and the result is the percentage of those items from
    `first` which are present anywhere in `second`.

    Example:
        >>> compare_names("Franta Putšálek", "ing. Franta Putšálek")
        100.0
        >>> compare_names("F. Putšálek", "ing. Franta Putšálek")
        50.0

    Args:
        first (str): First name as string.
        second (str): Second name as string.

    Returns:
        float: Percentage of the similarity. ``0.0`` is returned when any of
               the names yields an empty vector.
    """
    first = name_to_vector(first)
    second = name_to_vector(second)

    # same truncation zip() would do, but without depending on zip()
    # returning a list (it is a lazy iterator on Python 3)
    compared = first[:min(len(first), len(second))]

    if not compared:
        return 0.0

    similarity_factor = sum(1 for item in compared if item in second)

    return (float(similarity_factor) / len(compared)) * 100


def filter_publication(publication, cmp_authors=True):
    """
    Filter publications based on data from Aleph.

    Aleph is queried by ISBN if the publication has one, by title otherwise.
    Records found by ISBN are accepted as duplicates when their title is
    similar enough (>= 80 %). Records found by title have to match the title
    (>= 80 %) and, if `cmp_authors` is set, at least one author (>= 50 %).
    Similarity is computed by :func:`compare_names`.

    Args:
        publication (obj): :class:`.Publication` instance.
        cmp_authors (bool, default True): Compare also authors when the
                    lookup is done by title. When False, title match alone
                    is enough to consider the book as already present.

    Returns:
        obj/None: None if the publication was found in Aleph or `publication`
                  if not.
    """
    # there can be ISBN query or book title query
    isbn_query = bool(publication.optionals and publication.optionals.ISBN)

    if isbn_query:
        query = aleph.ISBNQuery(publication.optionals.ISBN)
    else:
        query = aleph.TitleQuery(publication.title)

    result = aleph.reactToAMQPMessage(aleph.SearchRequest(query), "")

    if not result.records:
        return publication  # book is not in database

    # if there was results with this ISBN, compare titles of the books
    # (sometimes, there are different books with same ISBN because of human
    # errors)
    if isbn_query:
        for record in result.records:
            epub = record.epublication

            # try to match title of the book
            if compare_names(epub.nazev, publication.title) >= 80:
                return None  # book already in database

        return publication

    # Names of the publication's authors do not depend on the Aleph record,
    # so they are collected only once - and only when really needed, to keep
    # `publication.authors` untouched on paths which never compared authors.
    pub_authors = None

    # checks whether the details from returned EPublication match Publication's
    for record in result.records:
        epub = record.epublication

        # if the title doesn't match, go to next record from aleph
        if compare_names(epub.nazev, publication.title) < 80:
            continue

        if not cmp_authors:
            return None  # book already in database

        # compare authors names
        for author in epub.autori:
            # convert Aleph's author structure to string
            author_str = "%s %s %s" % (
                author.firstName,
                author.lastName,
                author.title
            )

            if pub_authors is None:
                # real list (not a lazy map object), so it can be iterated
                # repeatedly for every author from Aleph
                pub_authors = [
                    pub_author.name for pub_author in publication.authors
                ]

            # try to compare authors from `publication` and Aleph
            for pub_author in pub_authors:
                if compare_names(author_str, pub_author) >= 50:
                    return None  # book already in database

    return publication  # book is not in database