# Source code for kuha_oai_pmh_repo_handler.oai.records

#!/usr/bin/env python3
# Author(s): Toni Sissala
# Copyright 2020 Finnish Social Science Data Archive FSD / University of Tampere
# Licensed under the EUPL. See LICENSE.txt for full license.
"""Define OAI records.

:note: This module has a strict dependency on :mod:`kuha_common.document_store.records`

Contains information for querying records from document store and
appending them to responses with :class:`OAIHeaders`, :class:`OAIRecord` and
:const:`SETS`.
"""

import logging
import re
from collections import namedtuple

from kuha_common.document_store import Study
from kuha_common.document_store.query import FilterKeyConstants
from kuha_oai_pmh_repo_handler.oai.constants import (
    OAI_REC_NAMESPACE_IDENTIFIER,
    OAI_REC_IDENTIFIER_PREFIX,
    REGEX_OAI_IDENTIFIER,
    REGEX_LOCAL_IDENTIFIER,
    REGEX_SETSPEC
)


#: Attribute to store set configuration.
#:
#: Fields, in positional order:
#:
#: * ``setname`` -- string used in <setName>
#: * ``setspec`` -- string used in <setSpec>
#: * ``record_field_setname`` -- document store field for getting the
#:   record's setName value
#: * ``record_field_setspec`` -- document store field for getting the
#:   record's setSpec value
#: * ``record_query_field`` -- document store field for querying records
#: * ``set_values_query_field`` -- document store field for querying
#:   (distinct) set values
SetAttribute = namedtuple("Set", (
    'setname',
    'setspec',
    'record_field_setname',
    'record_field_setspec',
    'record_query_field',
    'set_values_query_field',
))

#: Configuration for study_group set.
#: Every field is derived from ``Study.study_groups``. This is the only set
#: with a ``record_field_setname``, so it is the only one for which
#: :func:`get_sets_list_from_query_result` reads a setName from the
#: query result.
SET_STUDY_GROUP = SetAttribute(
    setname='Study group',
    setspec=Study.study_groups.name.name,
    record_field_setname=Study.study_groups.attr_name,
    record_field_setspec=Study.study_groups.sub_name.name,
    record_query_field=Study.study_groups.sub_name,
    set_values_query_field=Study.study_groups
)
#: Configuration for language set.
#: Every field is derived from ``Study.study_titles.attr_language``.
#: ``record_field_setname`` is None, so
#: :func:`get_sets_list_from_query_result` emits an empty setName.
SET_LANGUAGE = SetAttribute(
    setname='Language',
    setspec=Study.study_titles.attr_language.name,
    record_field_setname=None,
    record_field_setspec=Study.study_titles.attr_language.name,
    record_query_field=Study.study_titles.attr_language,
    set_values_query_field=Study.study_titles.attr_language
)
#: Configuration for datakind set.
#: Every field is derived from ``Study.data_kinds.sub_name``.
#: ``record_field_setname`` is None, so
#: :func:`get_sets_list_from_query_result` emits an empty setName.
SET_DATAKIND = SetAttribute(
    setname='Kind of data',
    setspec=Study.data_kinds.sub_name.name,
    record_field_setname=None,
    record_field_setspec=Study.data_kinds.sub_name.name,
    record_query_field=Study.data_kinds.sub_name,
    set_values_query_field=Study.data_kinds.sub_name
)
#: Supported sets.
#: Iterated by :func:`get_record_query_field_by_setspec` and
#: :func:`get_set_specs_from_ds_record`; a set missing from this list is
#: treated as unsupported by those functions.
SETS = [
    SET_STUDY_GROUP,
    SET_LANGUAGE,
    SET_DATAKIND
]

#: Validation regex for setspec, compiled from ``REGEX_SETSPEC``.
#: Used with ``fullmatch`` in :func:`is_valid_setspec`.
REGEX_VALID_SETSPEC = re.compile(REGEX_SETSPEC)


def is_valid_setspec(candidate):
    """Check a setSpec value against :const:`REGEX_VALID_SETSPEC`.

    The complete string has to match the pattern; a match of only a
    part of the string does not count.

    :param candidate: setSpec value to validate.
    :type candidate: str
    :returns: True if valid, False if not.
    :rtype: bool
    """
    match = REGEX_VALID_SETSPEC.fullmatch(candidate)
    return match is not None
def get_record_query_field_by_setspec(setspec):
    """Look up the document store query field of a supported set.

    Searches :const:`SETS` in order and uses the first entry whose
    ``setspec`` equals the requested one.

    :param setspec: setSpec field of the requested set.
    :type setspec: str
    :returns: document store field or None
    :rtype: :obj:`kuha_common.document_store.field_types.FieldAttribute`
            or None
    """
    matching_fields = (
        candidate.record_query_field
        for candidate in SETS
        if candidate.setspec == setspec
    )
    # First hit wins; None signals that no supported set matched.
    return next(matching_fields, None)
def get_set_specs_from_ds_record(ds_record):
    """Get set specs from document store record.

    For every set in :const:`SETS` the value of the set's
    ``record_query_field`` is read from the exported record. The value may
    be a single item or a list of items. Items that fail
    :func:`is_valid_setspec` are discarded with a logged warning. Duplicate
    items are dropped and the remaining ones keep the order in which they
    first appear, so the result is the same on every run.

    :param ds_record: One of the document store records.
                      Currently only Study is supported.
    :type ds_record: Record object from
                     :mod:`kuha_common.document_store.records`
    :returns: set specs for use in oai-headers. Keys are setSpec names of
              the supported sets, values are lists of accepted setSpec
              values. Sets without any accepted value are left out.
    :rtype: dict
    """
    record_dict = ds_record.export_dict()
    set_specs = {}
    for set_ in SETS:
        value = set_.record_query_field.value_from_dict(record_dict)
        if value is None:
            continue
        candidates = value if isinstance(value, list) else [value]
        # A dict is used as an ordered set: keys are unique and keep
        # insertion order, unlike a plain set whose iteration order for
        # strings changes between interpreter runs.
        accepted = {}
        for candidate in candidates:
            if candidate is None:
                # Treat a missing list entry like a missing scalar value
                # instead of letting the regex raise TypeError.
                continue
            if not is_valid_setspec(candidate):
                logging.warning(
                    "Discarding invalid setSpec value: %s", candidate
                )
                continue
            accepted[candidate] = None
        if accepted:
            set_specs[set_.setspec] = list(accepted)
    return set_specs
def get_sets_list_from_query_result(set_, query_result):
    """Build the ListSets entries of one set from a distinct-query result.

    The query is a distinct type of query built from the set attributes
    defined in this module, so ``query_result`` is not a document store
    record. The values are looked up from ``query_result`` by the path of
    the set's ``set_values_query_field``.

    When the set defines ``record_field_setname`` every value is expected
    to be a mapping holding the setSpec part and the setName; values whose
    setSpec part is None are skipped. Otherwise every value is itself the
    setSpec part and the setName is left empty.

    Entries whose complete setSpec fails :func:`is_valid_setspec` are
    discarded with a logged warning.

    :param set_: set-attribute used for the query.
    :type set_: :obj:`SetAttribute`
    :param query_result: results from the query.
    :type query_result: dict
    :returns: list of sets to be used in list sets response. Each item is
              a dict with keys ``setName`` and ``setSpec``.
    :rtype: list
    """
    listed_sets = []
    for entry in query_result[set_.set_values_query_field.path]:
        if not set_.record_field_setname:
            # Plain value: it is the setSpec part, there is no name.
            set_name = ''
            set_spec = set_.setspec + ':' + entry
        else:
            name_key = set_.record_field_setname.name
            spec_part = entry.get(set_.record_field_setspec)
            if spec_part is None:
                continue
            set_spec = set_.setspec + ':' + spec_part
            set_name = entry.get(name_key, '')
        if is_valid_setspec(set_spec):
            listed_sets.append({'setName': set_name, 'setSpec': set_spec})
        else:
            logging.warning(
                "Discarding invalid setSpec value: %s", set_spec
            )
    return listed_sets
def get_query_filter_for_set(set_request):
    """Get filter to use for querying document store.

    The requested set is either a bare setSpec name (``name``) or a
    setSpec name with a value (``name:value``). A bare name filters for
    records where the set's query field exists, a name with a value
    filters for records where the query field equals the value.

    :param str set_request: requested set
    :returns: Query filter, or None if the request contains more than one
              colon or names a set that is not supported.
    :rtype: dict or None
    """
    if set_request.count(':') > 1:
        return None
    key, separator, set_value = set_request.partition(':')
    if separator:
        criterion = set_value
    else:
        # No value given: match every record that has the field at all.
        criterion = {FilterKeyConstants.exists: True}
    query_field = get_record_query_field_by_setspec(key)
    if query_field is None:
        return None
    return {query_field: criterion}
class OAIHeaders:
    r"""Represents OAI-PMH record headers.

    Store information of a single record's headers and
    document store fields to include in query. Provides
    methods to validate OAI-Identifiers and to iterate set specs list.

    :param identifier: local identifier of a record.
    :type identifier: str
    :param datestamp: last modified/updated datestamp.
    :type datestamp: str
    :param \*\*set_specs: key-value pairs of set specs for the record.
                          Each value is an iterable of setSpec values.
    :raises: :exc:`ValueError` if the identifier fails validation.
    """

    #: Namespace identifier used to construct an OAI-Identifier
    #: Use None if wish to use local identifiers in OAI-responses.
    namespace_identifier = OAI_REC_NAMESPACE_IDENTIFIER
    #: Prefix for all identifiers when constructing an OAI-Identifier.
    identifier_oai_prefix = OAI_REC_IDENTIFIER_PREFIX
    #: Separator between prefix, namespace identifier and local identifier.
    identifier_separator = ':'
    #: Validation regex for OAI-Identifier
    valid_oai_identifier = re.compile(REGEX_OAI_IDENTIFIER)
    #: Validation regex for local identifier (a subset of oai-identifier)
    valid_identifier = re.compile(REGEX_LOCAL_IDENTIFIER)

    def __init__(self, identifier, datestamp, **set_specs):
        self.set_identifier(identifier)
        self.datestamp = datestamp
        self.set_specs = set_specs

    @classmethod
    def from_ds_record(cls, ds_record):
        """Return :obj:`OAIHeaders` constructed from document store record.

        The identifier is taken from the record's ``study_number``, the
        datestamp from ``get_updated()`` and the set specs from
        :func:`get_set_specs_from_ds_record`.

        :note: Currently supports only Study

        :param ds_record: Document Store record.
        :type ds_record: Record object defined in
                         :mod:`kuha_common.document_store.records`
        :returns: headers constructed from Document Store record.
                  The instance is created through ``cls``, so a subclass
                  gets an instance of itself.
        :rtype: :obj:`OAIHeaders`
        :raises: :exc:`ValueError` if the record's identifier fails
                 validation.
        """
        set_specs = get_set_specs_from_ds_record(ds_record)
        identifier = ds_record.study_number.get_value()
        datestamp = ds_record.get_updated()
        return cls(identifier, datestamp, **set_specs)

    @classmethod
    def set_namespace_identifier(cls, ns_id):
        """Set namespace identifier for all instances.

        :note: this will be validated afterwards in :meth:`set_identifier`

        :param ns_id: namespace identifier
        :type ns_id: str
        """
        cls.namespace_identifier = ns_id

    @classmethod
    def _is_valid_oai_identifier(cls, candidate):
        """Return True if the whole candidate matches the OAI-Identifier regex."""
        return cls.valid_oai_identifier.fullmatch(candidate) is not None

    @classmethod
    def _is_valid_identifier(cls, candidate):
        """Return True if the whole candidate matches the local identifier regex."""
        return cls.valid_identifier.fullmatch(candidate) is not None

    @classmethod
    def as_local_id(cls, identifier):
        """Get local identifier part of OAI-Identifier.

        If a namespace identifier is configured, the identifier is
        validated as an OAI-Identifier and the leading
        ``<prefix>:<namespace>:`` part is stripped. An OAI-Identifier that
        does not start with that part is returned unchanged.

        Without a namespace identifier the identifier is validated as
        a local identifier and returned as is.

        :param identifier: records identifier.
        :type identifier: str
        :returns: local identifier or None for invalid identifier.
        :rtype: str or None
        """
        if cls.namespace_identifier:
            if not cls._is_valid_oai_identifier(identifier):
                return None
            discard = cls.identifier_oai_prefix +\
                cls.identifier_separator +\
                cls.namespace_identifier +\
                cls.identifier_separator
            # Strip only a leading prefix. Removing the first occurrence
            # anywhere in the string would mangle identifiers that merely
            # contain the prefix further in.
            if identifier.startswith(discard):
                return identifier[len(discard):]
            return identifier
        if not cls._is_valid_identifier(identifier):
            return None
        return identifier

    @staticmethod
    def get_header_fields():
        """Get header fields to query.

        These are the fields required to construct the
        OAI-HEADER in templates. Check that each OAI-SET
        field is found here.

        :note: currently supports only Study.

        :returns: list of fields to contain in query.
        :rtype: list
        """
        return [
            Study.study_number,
            Study.study_titles,
            Study.study_groups,
            Study.data_kinds,
            Study._metadata
        ]

    def set_identifier(self, identifier):
        """Set identifier.

        If namespace_identifier is not None, will build an OAI-Identifier
        by joining the prefix, the namespace identifier and the local
        identifier with the identifier separator.
        The identifier will be validated and :exc:`ValueError`
        will be raised if the validation fails.

        :param identifier: Record's local identifier.
        :type identifier: str
        :raises: :exc:`ValueError` if validation fails.
        """
        if self.namespace_identifier:
            candidate = self.identifier_separator.join([
                self.identifier_oai_prefix,
                self.namespace_identifier,
                identifier
            ])
            if not self._is_valid_oai_identifier(candidate):
                raise ValueError(
                    "Invalid OAI-Identifier: {}".format(candidate)
                )
            self.identifier = candidate
        else:
            if not self._is_valid_identifier(identifier):
                raise ValueError(
                    "Invalid identifier: {}".format(identifier)
                )
            self.identifier = identifier

    def get_identifier(self):
        """Get identifier

        :returns: record's identifier.
        :rtype: str
        """
        return self.identifier

    def get_datestamp(self):
        """Get records datestamp

        :returns: record's datestamp
        :rtype: str
        """
        return self.datestamp

    def iterate_set_specs(self):
        """Iterate over setSpec key-value pairs.

        Every value of every set spec key is yielded as its own
        ``(key, value)`` pair.

        :returns: Generator object for iterating over
                  setSpec key-value pairs.
        :rtype: Generator
        """
        for spec_key, specs in self.set_specs.items():
            for spec in specs:
                yield spec_key, spec
class OAIRecord:
    """Class stores record and headers.

    The headers are built from the study with
    :meth:`OAIHeaders.from_ds_record`. Variables and questions start
    empty and are filled with :meth:`add_variable` and
    :meth:`add_question`.

    :param study: Document Store study record.
    :type study: :obj:`kuha_common.document_store.records.Study`
    """

    def __init__(self, study):
        self.study = study
        self.headers = OAIHeaders.from_ds_record(study)
        self.variables = []
        self.questions = {}

    def add_variable(self, variable):
        """Add variable to OAIRecord.

        :param variable: Document Store variable.
        :type variable: :obj:`kuha_common.document_store.records.Variable`
        """
        self.variables.append(variable)

    def add_question(self, question):
        """Add question to OAIRecord.

        Question lookup is done by variable name. Therefore it makes
        sense to use a dictionary with variable_name as key. The
        key content will be a list, since a variable may refer
        multiple questions.

        :note: questions without variable_name will be discarded
               and a warning will be logged.

        :param question: Document Store question.
        :type question: :obj:`kuha_common.document_store.records.Question`
        """
        variable_name = question.variable_name.get_value()
        if not variable_name:
            logging.warning("Discarding question without variable_name")
            return
        self.questions.setdefault(variable_name, []).append(question)

    def get_questions_by_variable(self, variable):
        """Get questions for OAIRecord by variable.

        Lookup questions by variable's variable_name.

        :param variable: Document Store variable.
        :type variable: :obj:`kuha_common.document_store.records.Variable`
        :returns: List of :obj:`kuha_common.document_store.records.Question`.
                  Empty list if no question was added for the variable.
        :rtype: list
        """
        return self.questions.get(variable.variable_name.get_value(), [])

    def iter_relpubls(self):
        """Iterates related publications by distinct lang and description.

        Generator yields two-tuples ('lang_desc', 'relpubls'):
        'lang_desc' is a two-tuple with first item being the language
        of the related publication and the second item being its
        description. 'relpubls' is a list containing all related
        publications of the study that share that language and
        description, in the order they appear in the study.

        :returns: generator that yields tuples (lang_desc, relpubls)
        """
        grouped = {}
        for relpubl in self.study.related_publications:
            key = (relpubl.get_language(),
                   relpubl.attr_description.get_value())
            grouped.setdefault(key, []).append(relpubl)
        # A separate name for the grouping dict keeps it from being
        # rebound by the iteration variables.
        yield from grouped.items()