Source code for harvester.filters.aleph_filter

#! /usr/bin/env python
# -*- coding: utf-8 -*-
#
# Interpreter version: python 2.7
#
"""
This module is used to skip Publications, which are already in Aleph.

Note:
    The module is using fuzzy lookup, see :func:`name_to_vector` and
    :func:`compare_names`.
"""
# Imports =====================================================================
import unicodedata

import edeposit.amqp.aleph as aleph


# Functions & objects =========================================================
def name_to_vector(name):
    """
    Convert `name` to the ASCII vector.

    The name is lowercased, diacritics are stripped, every character which
    is not a letter or a space is thrown away and the remaining words are
    ordered from the longest to the shortest one.

    Example:
        >>> name_to_vector("ing. Franta Putšálek")
        ['putsalek', 'franta', 'ing']

    Args:
        name (str): Name which will be vectorized. Byte strings are decoded
            as UTF-8.

    Returns:
        list: Vector created from name.
    """
    # `bytes` is an alias of `str` in python 2, so this check behaves the same
    # way as the old `not isinstance(name, unicode)` test, but it also works
    # in python 3, where the `unicode` builtin doesn't exist
    if isinstance(name, bytes):
        name = name.decode("utf-8")

    name = name.lower()

    # NFKD splits accented characters to base character + combining mark,
    # the combining marks are then dropped by the ASCII encoder
    name = unicodedata.normalize('NFKD', name).encode('ascii', 'ignore')

    # python 3 returns `bytes` from .encode() - convert them back to the
    # native string, so the vector always consists of `str` items
    if not isinstance(name, str):
        name = name.decode("ascii")

    # keep only letters and spaces (dots, digits and so on are removed)
    name = "".join(char for char in name if char.isalpha() or char == " ")

    # sorted() is stable, so the words with same length keep their order
    return sorted(name.split(), key=len, reverse=True)
def compare_names(first, second):
    """
    Compare two names in more complicated, but more error tolerant way.

    Both names are converted to vectors by :func:`name_to_vector`. Then the
    N longest words of the `first` name are looked up in the whole `second`
    name, where N is the word count of the shorter of the two names.

    Example:
        >>> compare_names("Franta Putšálek", "ing. Franta Putšálek")
        100.0
        >>> compare_names("F. Putšálek", "ing. Franta Putšálek")
        50.0

    Args:
        first (str): First name as string.
        second (str): Second name as string.

    Returns:
        float: Percentage of the similarity. ``0.0`` if any of the names \
               contains no words.
    """
    first = name_to_vector(first)
    second = name_to_vector(second)

    # same count as len(zip(first, second)), but it doesn't depend on zip()
    # returning list, which is no longer true in python 3
    compared = min(len(first), len(second))

    if not compared:
        return 0.0

    # count how many of the compared words from `first` are in `second`
    similarity_factor = sum(
        1 for word in first[:compared] if word in second
    )

    return (float(similarity_factor) / compared) * 100
def filter_publication(publication, cmp_authors=True):
    """
    Filter publications based at data from Aleph.

    Aleph is queried by ISBN if the `publication` has one, by the title of
    the book otherwise. Records returned from Aleph are then matched with
    the `publication` using :func:`compare_names`.

    Args:
        publication (obj): :class:`.Publication` instance.
        cmp_authors (bool, default True): Used only for title queries. If
            True, record from Aleph has to match the title and also at least
            one of the authors. If False, the title match is enough.

    Returns:
        obj/None: None if the publication was found in Aleph or `publication` \
                  if not.
    """
    # there can be ISBN query or book title query
    isbn_query = bool(publication.optionals and publication.optionals.ISBN)

    if isbn_query:
        query = aleph.ISBNQuery(publication.optionals.ISBN)
    else:
        query = aleph.TitleQuery(publication.title)

    result = aleph.reactToAMQPMessage(aleph.SearchRequest(query), "")

    if not result.records:
        return publication  # book is not in database

    # if there was results with this ISBN, compare titles of the books
    # (sometimes, there are different books with same ISBN because of human
    # errors)
    if isbn_query:
        for record in result.records:
            # try to match title of the book
            if compare_names(record.epublication.nazev,
                             publication.title) >= 80:
                return None  # book already in database

        return publication

    # names of the authors from `publication` - the list is built only once
    # and only when it is really needed (title matched, Aleph has authors)
    pub_authors = None

    # checks whether the details from returned EPublication match Publication's
    for record in result.records:
        epub = record.epublication

        # if the title doesn't match, go to next record from aleph
        if compare_names(epub.nazev, publication.title) < 80:
            continue

        if not cmp_authors:
            return None  # book already in database

        # compare authors names
        for author in epub.autori:
            # convert Aleph's author structure to string
            author_str = "%s %s %s" % (
                author.firstName,
                author.lastName,
                author.title
            )

            # real list is required here - lazy map() object from python 3
            # must not be passed to the compare_names() as one "name"
            if pub_authors is None:
                pub_authors = [x.name for x in publication.authors]

            # try to compare authors from `publication` and Aleph
            for pub_author in pub_authors:
                if compare_names(author_str, pub_author) >= 50:
                    return None  # book already in database

    return publication  # book is not in database