Source code for kuha_oai_pmh_repo_handler.metadataformats

"""Define metadata formats.

Metadataformats create response contexts by calling the oai_response
object and declare templates if needed. Metadataformats raise
oai_errors when needed.
"""
# Stdlib
import os.path
import datetime
import logging
from collections import namedtuple
# Kuha Common
from kuha_common.query import QueryController
from kuha_common.document_store import (
    query,
    client,
    Study,
    Variable,
    Question
)
from kuha_common.document_store.constants import REC_STATUS_DELETED
# Kuha OAI-PMH
from kuha_oai_pmh_repo_handler.genshi_loader import (
    add_template_folders,
    GenPlate
)
from kuha_oai_pmh_repo_handler.constants import TEMPLATE_FOLDER
from kuha_oai_pmh_repo_handler.oai.constants import (
    OAI_RESPONSE_LIST_SIZE,
    OAI_DEL_RECORDS_DECL_NO,
    OAI_DEL_RECORDS_DECL_TRANSIENT,
    OAI_DEL_RECORDS_DECL_PERSISTENT,
    OAI_DATESTAMP_GRANULARITY_DATETIME
)
# Local subpackage
from . import (
    const,
    exc,
    _mdsets
)


_logger = logging.getLogger(__name__)


_STORED = {'args_added': False,
           'configured': False,
           'deleted_records': None,
           'loaded_sets': None}


class DuplicateSetSpec(Exception):
    """Every OAI set must have a unique spec value"""
class MDFormat:
    """Base class for metadata formats.

    Defines common attributes and methods. Subclass to define
    metadataformats.
    """

    default_template_folders = [
        os.path.abspath(
            os.path.join(
                os.path.dirname(
                    os.path.realpath(__file__)),
                '..', TEMPLATE_FOLDER))]

    #: overridable controls how plugin discovery handles metadataformats with
    #: the same mdprefix. Built-in metadataformats could be overridable;
    #: those developed as a plugin should not be.
    overridable = False

    mdprefix = None
    mdschema = None
    mdnamespace = None

    study_class = Study
    variable_class = Variable
    question_class = Question

    list_size = OAI_RESPONSE_LIST_SIZE
    _deleted_records_default = OAI_DEL_RECORDS_DECL_TRANSIENT
    datestamp_granularity = OAI_DATESTAMP_GRANULARITY_DATETIME

    #: For convenience to facilitate easier subclassing of sets.
    #: The MDSet is tightly coupled with this class.
    MDSet = _mdsets.MDSet

    # All sets in repository.
    # Override to customize OAI sets. Note that this attribute must
    # contain the same objects for all metadataformat-objects in the
    # OAI-PMH repository.
    sets = [_mdsets.LanguageSet,
            _mdsets.StudyGroupsSet,
            _mdsets.DataKindSet,
            _mdsets.OpenAIREDataSet]

    def __init__(self, oai, corr_id_header):
        """Initialize base MDFormat.

        The oai argument wraps the oai-protocol. Its keys include
        'arguments', 'headers', 'errors', 'response', and the values
        correspond to the following objects:

          * arguments: :obj:`kuha_oai_pmh_repo_handler.oai.protocol.OAIArguments`
          * headers: :class:`kuha_oai_pmh_repo_handler.oai.protocol.OAIHeaders`
          * response: :obj:`kuha_oai_pmh_repo_handler.oai.protocol.OAIResponse`
          * errors: :mod:`kuha_oai_pmh_repo_handler.errors`

        :param oai: Object that wraps the oai-protocol.
        :param dict corr_id_header: CorrelationId HTTP-header.
        """
        if any(x is None for x in (self.mdprefix, self.mdschema, self.mdnamespace)):
            raise NotImplementedError("mdprefix, mdschema and mdnamespace must be defined in "
                                      "subclass")
        self._oai = oai
        self.corr_id_header = corr_id_header
    @classmethod
    def add_cli_args(cls, parser):
        """Add command line arguments to parser.

        Adds required command line arguments regarding metadataformats &
        sets. This should be called on program startup along with other
        command line argument definitions if the program allows
        configuration of metadataformats & sets.

        :param :obj:`configargparse.ArgumentParser` parser: Active command line parser.
        """
        if _STORED['args_added']:
            return
        # TODO where should the default template folder be declared?
        parser.add('--template-folder',
                   help='Folder containing XML templates',
                   default=cls.default_template_folders,
                   env_var='OPRH_TEMPLATES',
                   action='append', type=str)
        parser.add('--oai-pmh-deleted-records',
                   help='Deleted records declaration for Identify verb.',
                   default=cls._deleted_records_default,
                   env_var='OPRH_DELETED_RECORDS',
                   choices=[OAI_DEL_RECORDS_DECL_NO,
                            OAI_DEL_RECORDS_DECL_TRANSIENT,
                            OAI_DEL_RECORDS_DECL_PERSISTENT])
        client.add_cli_args()
        query.add_cli_args()
        for set_ in cls.sets:
            set_.add_cli_args(parser)
        _STORED['args_added'] = True
    @classmethod
    def configure_sets(cls, settings):
        """Configure & load sets using settings.

        Calls configure() of each MDSet class stored in the class variable
        'sets'. The configure() will be called with the 'settings'
        parameter. If configure() returns False the set will not be
        loaded, but will be discarded instead. Otherwise, the configured
        set will be stored in a module level variable and used to serve
        OAI requests.

        :param :obj:`argparse.Namespace` settings: Loaded settings
        :raises: :exc:`DuplicateSetSpec` if two configured sets have a
                 duplicate value in the 'spec' class level variable.
        """
        if _STORED['loaded_sets'] is not None:
            raise ValueError("Sets already loaded")
        _STORED['loaded_sets'] = {}
        for set_ in cls.sets:
            if set_.configure(settings) is False:
                # Discarding set
                _logger.info("Discarding OAI set '%s' with spec '%s'", set_, set_.spec)
                continue
            if set_.spec in _STORED['loaded_sets']:
                raise DuplicateSetSpec("Found duplicate spec value '%s'" % (set_.spec,))
            _STORED['loaded_sets'][set_.spec] = set_
    @classmethod
    def configure(cls, settings):
        """Configure metadataformats & sets using settings.

        :param :obj:`argparse.Namespace` settings: Loaded settings
        """
        if _STORED['configured']:
            return
        add_template_folders(*settings.template_folder)
        client.configure(settings)
        query.configure(settings)
        cls.configure_sets(settings)
        _STORED['deleted_records'] = settings.oai_pmh_deleted_records
        _STORED['configured'] = True
    @staticmethod
    def get_deleted_record():
        """Get DeletedRecord OAI-PMH property"""
        if _STORED['deleted_records'] is None:
            raise ValueError("DeletedRecord is not configured. Call configure() first.")
        return _STORED['deleted_records']
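    # Illustrative usage sketch (not part of the module): the classmethods
    # above are wired together at program startup roughly like the following,
    # assuming a configargparse-style parser. The exact parser construction
    # belongs to the calling program and is shown here only as an assumption.
    #
    #     parser = configargparse.ArgumentParser()
    #     MDFormat.add_cli_args(parser)    # --template-folder, --oai-pmh-deleted-records, ...
    #     settings = parser.parse_args()
    #     MDFormat.configure(settings)     # configures client, query and loads sets
    #     deleted_decl = MDFormat.get_deleted_record()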
    async def _header_fields(self):
        fields = [self.study_class._metadata,
                  self.study_class.study_number]
        for set_ in self._iter_initialized_sets():
            fields.extend(await set_.fields())
        return fields

    @property
    def _record_fields(self):
        """Implement in subclass"""
        raise NotImplementedError

    @staticmethod
    async def _min_increment_step(datetime_str):
        """Count smallest increment step from datetime string.

        :param datetime_str: string representation of a datetime.
                             Datetime must be represented either by day's
                             precision or by second's precision.
        :type datetime_str: str
        :returns: smallest increment step.
        :rtype: :obj:`datetime.timedelta`
        :raises: :exc:`ValueError` if string length is invalid.
        """
        if len(datetime_str) == 10:
            # day's precision
            increment = datetime.timedelta(days=1)
        elif len(datetime_str) == 20:
            # second's precision
            increment = datetime.timedelta(seconds=1)
        else:
            raise ValueError("Invalid datetime string: {}".format(datetime_str))
        return increment
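    # Worked example (illustration only, not part of the module) of
    # _min_increment_step() above: a 10-character date string yields a one-day
    # step, a 20-character datetime string yields a one-second step.
    #
    #     await MDFormat._min_increment_step('2020-01-01')            # timedelta(days=1)
    #     await MDFormat._min_increment_step('2020-01-01T00:00:00Z')  # timedelta(seconds=1)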
    @classmethod
    def get_set(cls, setspec):
        """Get set matching 'setspec' value.

        :param str setspec: Set to lookup.
        :returns: Found set, which is a subclass of :class:`MDSet`
        :raises: :exc:`exc.NoSuchSet` if a set is not found.
        """
        for set_ in cls.sets:
            if setspec == set_.spec:
                return set_
        raise exc.NoSuchSet("Could not find set matching setspec '%s'" % (setspec,))
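    # Usage sketch (illustration only): get_set() resolves an OAI setSpec to
    # its MDSet subclass. The spec value 'language' is an assumption here; the
    # actual spec strings are defined by the MDSet subclasses in _mdsets.
    #
    #     language_set = MDFormat.get_set('language')   # raises exc.NoSuchSet if unknown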
    @staticmethod
    def _iter_loaded_sets():
        loaded_sets = _STORED['loaded_sets'].values() if _STORED['loaded_sets'] else []
        for set_ in loaded_sets:
            yield set_

    def _iter_initialized_sets(self):
        for set_ in self._iter_loaded_sets():
            yield set_(self)

    @staticmethod
    def _get_loaded_set(setspec):
        loaded_set = _STORED['loaded_sets'].get(setspec)
        if loaded_set is None:
            raise exc.NoSuchSet("Could not find set matching setspec '%s'" % (setspec,))
        return loaded_set

    def _get_initialized_set(self, setspec):
        return self._get_loaded_set(setspec)(self)

    async def _set_filter(self, requested_set):
        colon_count = requested_set.count(':')
        if colon_count == 0:
            set_key = requested_set
            value = None
        elif colon_count == 1:
            set_key, value = requested_set.split(':')
        else:
            raise self._oai.errors.NoRecordsMatch()
        try:
            set_ = self._get_initialized_set(set_key)
        except exc.NoSuchSet:
            # This method is called when the HTTP request is using a set.
            # Therefore the condition is not a programming error but
            # an oaierror. Mask NoSuchSet and raise NoRecordsMatch.
            raise self._oai.errors.NoRecordsMatch()
        return await set_.filter(value)

    async def _prepare_get_record(self):
        if self._oai.response.records == []:
            raise self._oai.errors.IdDoesNotExist(context=self._oai.arguments.identifier)
        return await self._oai.response.get_record_response()

    async def _prepare_list_records(self):
        if self._oai.arguments.is_selective() and self._oai.response.records == []:
            raise self._oai.errors.NoRecordsMatch()
        return await self._oai.response.list_records_response()

    async def _metadata_response(self):
        await self._oai.response.set_metadata_format(self.mdschema, self.mdnamespace)
        _prepare_call = {
            self._oai.arguments.verb_value_get_record: self._prepare_get_record,
            self._oai.arguments.verb_value_list_records: self._prepare_list_records
        }[self._oai.arguments.verb]
        return await _prepare_call()

    async def _add_record(self, identifier, datestamp, record_objects, setspecs, deleted):
        headers = self._oai.headers(identifier, datestamp, deleted)
        for set_ in self._iter_loaded_sets():
            if set_.spec not in setspecs:
                raise ValueError("Setspecs for '%s' is missing. Cannot build sets." % (set_.spec,))
            for val in setspecs.pop(set_.spec):
                headers.add_set_spec(set_.spec, val)
        if setspecs != {}:
            raise ValueError("Found extra set information in metadataformat: '%s'" % (setspecs,))
        record_objects.update({'headers': headers})
        self._oai.response.records.append(record_objects)

    async def _get_identifier(self, study, **record_objs):
        """Get identifier from record objects.

        Override in subclass to declare a specific identifier.

        :param study: Study from document store.
        :returns: Identifier
        """
        return study.study_number.get_value()

    async def _on_record(self, study, **record_objs):
        identifier = await self._get_identifier(study, **record_objs)
        setspecs = {}
        for set_ in self._iter_initialized_sets():
            setspecs.update({set_.spec: await set_.get(study)})
        record_objs['study'] = study
        datestamp = study.get_deleted() if study.is_deleted() else study.get_updated()
        await self._add_record(identifier, datestamp, record_objs, setspecs, study.is_deleted())

    async def _has_record(self):
        _filter = await self._valid_record_filter()
        result = await QueryController().query_single(
            self.study_class, headers=self.corr_id_header,
            _filter=_filter, fields=self.study_class._id)
        return bool(result)

    async def _queryparams_from_resumption_token(self):
        MDQueryParams = namedtuple('MDQueryParams', ['skip', 'from_', 'until', 'set_'])
        self._oai.arguments.resumption_token.response_list_size = self.list_size
        _until = self._oai.arguments.resumption_token.until + await self._min_increment_step(
            self._oai.arguments.resumption_token.until_str)
        return MDQueryParams(skip=self._oai.arguments.resumption_token.cursor,
                             from_=self._oai.arguments.resumption_token.from_,
                             set_=self._oai.arguments.resumption_token.set_,
                             until=_until)

    async def _get_record(self):
        fields = await self._header_fields() + self._record_fields
        _filter = await self._valid_record_filter()
        await QueryController().query_single(
            self.study_class, on_record=self._on_record,
            headers=self.corr_id_header, _filter=_filter,
            fields=list(set(fields)))

    async def _valid_records_filter(self):
        """Return query filter that returns all valid records.

        Override in subclass to define specific filter requirements.

        :returns: Query filter
        :rtype: dict
        """
        return {}

    async def _valid_record_filter(self):
        """Return query filter that returns a valid record.

        Override in subclass to define specific filter requirements.

        :returns: Query filter
        :rtype: dict
        """
        return {self.study_class.study_number: self._oai.arguments.get_local_identifier()}

    async def _list_request_filter(self, qparams):
        _filter = await self._valid_records_filter()
        if qparams.set_:
            _filter.update(await self._set_filter(qparams.set_))
        if qparams.from_ or qparams.until:
            _filter.update({self.study_class._metadata.attr_updated: {
                QueryController.fk_constants.from_: qparams.from_,
                QueryController.fk_constants.until: qparams.until}})
        return _filter

    async def _query_records(self, add_fields=None):
        add_fields = add_fields or []
        qparams = await self._queryparams_from_resumption_token()
        _filter = await self._list_request_filter(qparams)
        queryctrl = QueryController(headers=self.corr_id_header)
        count = await queryctrl.query_count(self.study_class, _filter=_filter)
        self._oai.arguments.resumption_token.complete_list_size = count
        fields = await self._header_fields() + add_fields
        await queryctrl.query_multiple(
            self.study_class, on_record=self._on_record, _filter=_filter,
            fields=list(set(fields)), limit=self.list_size, skip=qparams.skip)

    async def _list_records(self):
        await self._query_records(self._record_fields)
    async def get_earliest_datestamp(self):
        """Get earliest datestamp as python datetime object.

        :returns: earliest datestamp for this metadataformat.
        :rtype: :obj:`datetime.datetime`
        """
        datestamp = None
        study = await QueryController().query_single(
            self.study_class, headers=self.corr_id_header,
            fields=self.study_class._metadata,
            sort_order=1,
            sort_by=self.study_class._metadata.attr_updated)
        if study:
            datestamp = study._metadata.attr_updated.get_value()
        return datestamp
    async def list_sets(self):
        """Outputs all sets from all records in the whole repository.

        If overridden, this should be overridden in all subclasses and
        it should behave the same in every subclass::

            async def _list_sets():
                ...

            class MyMetadataFormat(MDFormat):

                list_sets = _list_sets
        """
        if list(self._iter_loaded_sets()) == []:
            raise self._oai.errors.NoSetHierarchy()
        for set_ in self._iter_initialized_sets():
            await set_.query(self._oai.response.add_sets_element)
    async def list_identifiers(self):
        """Query record identifiers from backend.

        Queries records and raises NoRecordsMatch oai error if the
        request is selective and no records were found.
        """
        await self._query_records()
        if self._oai.arguments.is_selective() and self._oai.response.records == []:
            raise self._oai.errors.NoRecordsMatch()
    async def list_metadata_formats(self):
        """Adds information regarding this metadataformat to response.

        If the request contains an identifier, first makes sure the
        record exists in the backend, then adds the metadataformat
        information to the response.
        """
        if self._oai.arguments.identifier:
            if await self._has_record():
                await self._oai.response.add_available_metadata_format(
                    self.mdprefix, self.mdschema, self.mdnamespace)
            return
        await self._oai.response.add_available_metadata_format(
            self.mdprefix, self.mdschema, self.mdnamespace)
    # Subclass defines implementation for the following methods.
    async def get_record(self):
        """Adds record to response.

        This is an abstract method that must be implemented in a
        subclass. Note that the correct templates also need to be
        defined in the subclass via decoration.

        The implementation must query the backend for the requested
        record, raise OAI errors if needed and return the correct
        oai.response.context.

        :raises: :exc:`NotImplementedError`
        """
        raise NotImplementedError
    async def list_records(self):
        """Adds records to response.

        This is an abstract method that must be implemented in a
        subclass. The subclass must also define the correct template
        via decoration.

        The implementation must query the backend for the requested
        records, raise OAI errors when needed and return the correct
        oai.response.context.

        :raises: :exc:`NotImplementedError`
        """
        raise NotImplementedError
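# Illustrative sketch (not part of the module): a minimal custom metadataformat
# would subclass MDFormat, define the mandatory mdprefix/mdschema/mdnamespace,
# list its record fields and decorate get_record()/list_records() with GenPlate
# templates, mirroring the built-in formats below. The prefix, schema URL and
# subtemplate name here are hypothetical.
#
#     class MyMetadataFormat(MDFormat):
#
#         mdprefix = 'my_format'
#         mdschema = 'https://example.org/my_format.xsd'
#         mdnamespace = 'https://example.org/my_format/'
#
#         @property
#         def _record_fields(self):
#             return [self.study_class.abstract]
#
#         @GenPlate('get_record.xml', subtemplate='my_format.xml')
#         async def get_record(self):
#             await super()._get_record()
#             return await super()._metadata_response()
#
#         @GenPlate('list_records.xml', subtemplate='my_format.xml')
#         async def list_records(self):
#             await super()._list_records()
#             return await super()._metadata_response()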
class DCMetadataFormat(MDFormat):

    overridable = True
    mdprefix = 'oai_dc'
    mdschema = 'http://www.openarchives.org/OAI/2.0/oai_dc.xsd'
    mdnamespace = 'http://www.openarchives.org/OAI/2.0/oai_dc/'

    @property
    def _record_fields(self):
        return [self.study_class.identifiers,
                self.study_class.principal_investigators,
                self.study_class.publishers,
                self.study_class.document_uris,
                self.study_class.abstract,
                self.study_class.keywords,
                self.study_class.publication_years,
                self.study_class.study_area_countries,
                self.study_class.data_collection_copyrights]
    @classmethod
    def add_cli_args(cls, parser):
        super().add_cli_args(parser)
        parser.add('--oai-pmh-list-size-oai-dc',
                   help='How many results should a list response contain for '
                        'OAI DC metadata',
                   default=OAI_RESPONSE_LIST_SIZE,
                   env_var='OPRH_OP_LIST_SIZE_OAI_DC',
                   type=int)
    @classmethod
    def configure(cls, settings):
        cls.list_size = settings.oai_pmh_list_size_oai_dc
        super().configure(settings)
    @GenPlate('get_record.xml', subtemplate='oai_dc.xml')
    async def get_record(self):
        await super()._get_record()
        return await super()._metadata_response()

    @GenPlate('list_records.xml', subtemplate='oai_dc.xml')
    async def list_records(self):
        await super()._list_records()
        return await super()._metadata_response()
class EAD3MetadataFormat(MDFormat):

    overridable = True
    mdprefix = 'ead3'
    mdschema = 'http://www.loc.gov/ead/ead3.xsd'
    mdnamespace = 'http://ead3.archivists.org/schema/'

    @property
    def _record_fields(self):
        return [self.study_class.publishers,
                self.study_class.file_names,
                self.study_class.document_uris,
                self.study_class.collection_periods,
                self.study_class.principal_investigators,
                self.study_class.keywords,
                self.study_class.classifications,
                self.study_class.study_area_countries,
                self.study_class.geographic_coverages,
                self.study_class.data_access,
                self.study_class.data_collection_copyrights,
                self.study_class.citation_requirements,
                self.study_class.deposit_requirements,
                self.study_class.abstract]
    @classmethod
    def add_cli_args(cls, parser):
        super().add_cli_args(parser)
        parser.add('--oai-pmh-list-size-ead3',
                   help='How many results should a list response contain for '
                        'EAD3 metadata',
                   default=OAI_RESPONSE_LIST_SIZE,
                   env_var='OPRH_OP_LIST_SIZE_EAD3',
                   type=int)
    @classmethod
    def configure(cls, settings):
        cls.list_size = settings.oai_pmh_list_size_ead3
        super().configure(settings)
    @GenPlate('get_record.xml', subtemplate='ead3.xml')
    async def get_record(self):
        await super()._get_record()
        return await super()._metadata_response()

    @GenPlate('list_records.xml', subtemplate='ead3.xml')
    async def list_records(self):
        await super()._list_records()
        return await super()._metadata_response()
    @staticmethod
    def get_daterange_pairs(colldates):
        """Record helper method extracts daterange pairs from a list of
        Study.collection_periods.

        Returns a list of two-tuples [(start, end)]. Both items inside
        the tuple are instances of Study.collection_periods values.

        :param list colldates: collection periods list
        :returns: List of date range pairs in two-tuples (start, end)
        :rtype: list
        """
        pairs = []
        cur_pair = {'start': None, 'end': None}

        def _push(start=None, end=None):
            if start is not None:
                if cur_pair['start'] is not None:
                    pairs.append((cur_pair['start'], cur_pair['end']))
                    cur_pair['end'] = None
                cur_pair['start'] = start
                return
            if end is not None:
                if cur_pair['end'] is not None:
                    pairs.append((cur_pair['start'], cur_pair['end']))
                    cur_pair['start'] = None
                cur_pair['end'] = end
                return
            if any(x is not None for x in [cur_pair['start'], cur_pair['end']]):
                pairs.append((cur_pair['start'], cur_pair['end']))

        for colldate in colldates:
            if colldate.attr_event.get_value() == 'start':
                _push(start=colldate)
            elif colldate.attr_event.get_value() == 'end':
                _push(end=colldate)
        _push()
        return pairs
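    # Worked example (illustration only, names are hypothetical) of
    # get_daterange_pairs() above:
    #
    #     # colldates whose attr_event values are, in order, ['start', 'end', 'start']
    #     EAD3MetadataFormat.get_daterange_pairs(colldates)
    #     # -> [(first_start, first_end), (second_start, None)]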
    @staticmethod
    def get_singledates(colldates):
        """Record helper method extracts single dates from a list of
        Study.collection_periods.

        Returns a list of Study.collection_periods values.

        :param list colldates: collection periods list
        :returns: List of single dates
        :rtype: list
        """
        dates = []
        for colldate in colldates:
            if colldate.attr_event.get_value() == 'single':
                dates.append(colldate)
        return dates
    async def _on_record(self, study):
        await super()._on_record(study,
                                 get_daterange_pairs=self.get_daterange_pairs,
                                 get_singledates=self.get_singledates)
class DDICMetadataFormat(MDFormat):

    overridable = True
    mdprefix = 'ddi_c'
    mdschema = 'http://www.ddialliance.org/Specification/DDI-Codebook/2.5/XMLSchema/codebook.xsd'
    mdnamespace = 'ddi:codebook:2_5'

    @property
    def _record_fields(self):
        return [self.study_class.identifiers,
                self.study_class.document_titles,
                self.study_class.publishers,
                self.study_class.document_uris,
                self.study_class.study_uris,
                self.study_class.distributors,
                self.study_class.copyrights,
                self.study_class.parallel_titles,
                self.study_class.principal_investigators,
                self.study_class.publication_dates,
                self.study_class.publication_years,
                self.study_class.keywords,
                self.study_class.time_methods,
                self.study_class.sampling_procedures,
                self.study_class.collection_modes,
                self.study_class.analysis_units,
                self.study_class.collection_periods,
                self.study_class.classifications,
                self.study_class.abstract,
                self.study_class.study_area_countries,
                self.study_class.universes,
                self.study_class.data_access,
                self.study_class.data_access_descriptions,
                self.study_class.file_names,
                self.study_class.data_collection_copyrights,
                self.study_class.citation_requirements,
                self.study_class.deposit_requirements,
                self.study_class.geographic_coverages,
                self.study_class.instruments,
                self.study_class.related_publications,
                self.study_class.grant_numbers,
                self.study_class.funding_agencies]
    @classmethod
    def add_cli_args(cls, parser):
        super().add_cli_args(parser)
        parser.add('--oai-pmh-list-size-ddi-c',
                   help='How many results should a list response contain for '
                        'DDI_C metadata',
                   default=OAI_RESPONSE_LIST_SIZE,
                   env_var='OPRH_OP_LIST_SIZE_DDI_C',
                   type=int)
    @classmethod
    def configure(cls, settings):
        cls.list_size = settings.oai_pmh_list_size_ddi_c
        super().configure(settings)
    @staticmethod
    def iter_relpubls(study):
        relpubls = {}
        for relpubl in study.related_publications:
            key = (relpubl.get_language(), relpubl.attr_description.get_value())
            if key in relpubls:
                relpubls[key].append(relpubl)
            else:
                relpubls.update({key: [relpubl]})
        for lang_desc, relpubls in relpubls.items():
            yield lang_desc, relpubls

    async def _on_record(self, study):
        if self._oai.arguments.verb == self._oai.arguments.verb_value_list_identifiers\
           or study.is_deleted():
            await super()._on_record(study)
            return
        variables, questions = [], {}

        async def _add_question(question):
            varname = question.variable_name.get_value()
            if varname is None:
                return
            if varname in questions:
                questions[varname].append(question)
                return
            questions.update({varname: [question]})

        await QueryController().query_multiple(
            self.variable_class, on_record=variables.append,
            headers=self.corr_id_header,
            _filter={
                self.variable_class.study_number: study.study_number.get_value(),
                self.variable_class._metadata.attr_status: {
                    QueryController.fk_constants.not_equal: REC_STATUS_DELETED}},
            fields=[self.variable_class.variable_name,
                    self.variable_class.variable_labels,
                    self.variable_class.codelist_codes],
            sort_by=self.variable_class._metadata.attr_created)
        await QueryController().query_multiple(
            self.question_class, on_record=_add_question,
            headers=self.corr_id_header,
            _filter={
                self.question_class.study_number: study.study_number.get_value(),
                self.question_class._metadata.attr_status: {
                    QueryController.fk_constants.not_equal: REC_STATUS_DELETED}},
            fields=[self.question_class.question_identifier,
                    self.question_class.question_texts,
                    self.question_class.variable_name],
            sort_by=self.question_class._metadata.attr_created)
        await super()._on_record(study,
                                 iter_relpubls=DDICMetadataFormat.iter_relpubls,
                                 variables=variables,
                                 questions=questions)

    @GenPlate('get_record.xml', subtemplate='ddi_c.xml')
    async def get_record(self):
        await super()._get_record()
        return await super()._metadata_response()

    @GenPlate('list_records.xml', subtemplate='ddi_c.xml')
    async def list_records(self):
        await super()._list_records()
        return await super()._metadata_response()
class OAIDDI25MetadataFormat(MDFormat):

    overridable = True
    mdprefix = 'oai_ddi25'
    mdschema = DDICMetadataFormat.mdschema
    mdnamespace = DDICMetadataFormat.mdnamespace

    @property
    def _record_fields(self):
        return [self.study_class.identifiers,
                self.study_class.document_titles,
                self.study_class.publishers,
                self.study_class.document_uris,
                self.study_class.study_uris,
                self.study_class.distributors,
                self.study_class.copyrights,
                self.study_class.parallel_titles,
                self.study_class.principal_investigators,
                self.study_class.publication_dates,
                self.study_class.publication_years,
                self.study_class.keywords,
                self.study_class.time_methods,
                self.study_class.sampling_procedures,
                self.study_class.collection_modes,
                self.study_class.analysis_units,
                self.study_class.collection_periods,
                self.study_class.classifications,
                self.study_class.abstract,
                self.study_class.study_area_countries,
                self.study_class.universes,
                self.study_class.data_access,
                self.study_class.data_access_descriptions,
                self.study_class.file_names,
                self.study_class.data_collection_copyrights,
                self.study_class.citation_requirements,
                self.study_class.deposit_requirements,
                self.study_class.geographic_coverages,
                self.study_class.instruments,
                self.study_class.grant_numbers,
                self.study_class.related_publications,
                self.study_class.funding_agencies]
    @classmethod
    def add_cli_args(cls, parser):
        super().add_cli_args(parser)
        parser.add('--oai-pmh-list-size-oai-ddi25',
                   help='How many results should a list response contain for '
                        'OAI DDI25 metadata',
                   default=OAI_RESPONSE_LIST_SIZE,
                   env_var='OPRH_OP_LIST_SIZE_OAI_DDI25',
                   type=int)
    @classmethod
    def configure(cls, settings):
        cls.list_size = settings.oai_pmh_list_size_oai_ddi25
        super().configure(settings)
    async def _on_record(self, study):
        await super()._on_record(study, iter_relpubls=DDICMetadataFormat.iter_relpubls)

    @GenPlate('get_record.xml', subtemplate='oai_ddi25.xml')
    async def get_record(self):
        await super()._get_record()
        return await super()._metadata_response()

    @GenPlate('list_records.xml', subtemplate='oai_ddi25.xml')
    async def list_records(self):
        await super()._list_records()
        return await super()._metadata_response()
class OAIDataciteMetadataFormat(MDFormat):
    """Metadataformat for OpenAIRE DataCite"""

    overridable = True
    mdprefix = 'oai_datacite'
    mdschema = 'http://schema.datacite.org/meta/kernel-3/metadata.xsd'
    mdnamespace = 'http://datacite.org/schema/kernel-3'

    async def _header_fields(self):
        fields = await super()._header_fields()
        fields.append(self.study_class.identifiers)
        return fields

    @property
    def _record_fields(self):
        return [self.study_class.identifiers,
                self.study_class.principal_investigators,
                self.study_class.distributors,
                self.study_class.publishers,
                self.study_class.publication_years,
                self.study_class.keywords,
                self.study_class.classifications,
                self.study_class.data_access,
                self.study_class.abstract,
                self.study_class.geographic_coverages,
                self.study_class.study_titles,
                self.study_class.related_publications,
                self.study_class.grant_numbers]
    @classmethod
    def add_cli_args(cls, parser):
        super().add_cli_args(parser)
        parser.add('--oai-pmh-list-size-oai-datacite',
                   help='How many results should a list response contain for '
                        'OAI Datacite metadata',
                   default=OAI_RESPONSE_LIST_SIZE,
                   env_var='OPRH_OP_LIST_SIZE_OAI_DATACITE',
                   type=int)
    @classmethod
    def configure(cls, settings):
        cls.list_size = settings.oai_pmh_list_size_oai_datacite
        super().configure(settings)
    @classmethod
    async def get_preferred_identifier(cls, study):
        """OpenAIRE datacite requires a certain type of ID.

        Identifier type must be one of (also the lookup order):

          * DOI
          * ARK
          * Handle
          * PURL
          * URN
          * URL

        :param :obj:`kuha_common.document_store.records.Study` study: Currently serialized study.
        :returns: (<str:type>, <str:id>)
        :rtype: tuple
        """
        types_ids = {}
        for identifier in study.identifiers:
            typ = identifier.attr_agency.get_value()
            val = identifier.get_value()
            if typ in const.valid_openaire_id_types and val != types_ids.get(typ, None):
                types_ids.update({typ: val})
        for preferred in const.valid_openaire_id_types:
            if preferred in types_ids:
                return (preferred, types_ids[preferred])
        return ()
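    # Worked example (illustration only) of get_preferred_identifier(): the
    # lookup honours the order of const.valid_openaire_id_types, so a study
    # carrying both a URN and a DOI identifier resolves to the DOI. The
    # identifier value below is hypothetical.
    #
    #     await OAIDataciteMetadataFormat.get_preferred_identifier(study)
    #     # -> e.g. ('DOI', '10.1234/xyz'), or () when no valid type is found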
    @staticmethod
    async def get_publication_year(study):
        for publyear in study.publication_years:
            candidate = publyear.attr_distribution_date.get_value() or publyear.get_value()
            if candidate:
                return candidate[:4] if len(candidate) > 4 else candidate
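    # Worked example (illustration only) of get_publication_year(): the first
    # non-empty distribution date or publication year value is truncated to a
    # four-character year, e.g. a value of '2019-06-01' yields '2019'.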
    @staticmethod
    async def get_publisher_lang_value_pair(study):
        """Get publisher language & value pair as tuple.

        :param :obj:`kuha_common.document_store.records.Study` study: Currently serialized study.
        :returns: (<str:language>, <str:publisher>)
        :rtype: tuple
        """
        # Distributor is the primary source
        candidate = ()
        for distributor in study.distributors:
            if distributor.get_language() == 'en':
                candidate = ('en', distributor.get_value())
                break
            if not candidate:
                candidate = (distributor.get_language(), distributor.get_value())
        if candidate:
            return candidate
        for publisher in study.publishers:
            if publisher.get_language() == 'en':
                candidate = ('en', publisher.get_value())
                break
            if not candidate:
                candidate = (publisher.get_language(), publisher.get_value())
        return candidate
    @staticmethod
    async def get_funders(study):
        """Get OpenAIRE Datacite funders.

        OpenAIRE Datacite requires a certain nameIdentifier for
        Contributor. The syntax is described at
        https://guidelines.openaire.eu/en/latest/data/field_contributor.html#nameidentifier-ma-o
        This method filters in study.grant_number values that conform to
        the syntax.

        :param :obj:`kuha_common.document_store.records.Study` study: Currently serialized study.
        :returns: list of three-tuples [(<str:language>, <str:nameidentifier>, <str:agency>)]
        :rtype: list
        """
        rval = []
        for grant_no in study.grant_numbers:
            val = grant_no.get_value()
            if val and val.startswith('info:eu-repo/grantAgreement/'):
                rval.append((grant_no.get_language(), val, grant_no.attr_agency.get_value()))
        return rval
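    # Worked example (illustration only) of get_funders(): only grant numbers
    # using the OpenAIRE grantAgreement syntax are kept. The funder and
    # project id below are hypothetical.
    #
    #     # a value like 'info:eu-repo/grantAgreement/EC/FP7/12345' is included,
    #     # while a plain value such as '12345' is filtered out.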
    async def _on_record(self, study):
        preferred_id = await self.get_preferred_identifier(study)
        if preferred_id != ():
            # Only add records that have some valid id.
            # For GetRecord, this leads to idDoesNotExist.
            # For ListRecords & ListIdentifiers this may lead to a false record count,
            # however, ListRecords & ListIdentifiers should use _valid_records_filter() to
            # make sure this will never happen.
            publication_year = await self.get_publication_year(study)
            publisher = await self.get_publisher_lang_value_pair(study)
            related_identifier_types_ids = await self.get_related_identifiers_types(study)
            funders = await self.get_funders(study)
            await super()._on_record(study,
                                     preferred_identifier=preferred_id,
                                     publication_year=publication_year,
                                     publisher_lang_val=publisher,
                                     related_identifier_types_ids=related_identifier_types_ids,
                                     funders=funders)

    @GenPlate('get_record.xml', subtemplate='oai_datacite.xml')
    async def get_record(self):
        await super()._get_record()
        return await super()._metadata_response()

    async def _valid_records_filter(self):
        _filter = await super()._valid_records_filter()
        _filter.update({
            self.study_class.identifiers.attr_agency: {
                QueryController.fk_constants.in_: list(const.valid_openaire_id_types)}})
        return _filter

    @GenPlate('list_records.xml', subtemplate='oai_datacite.xml')
    async def list_records(self):
        await super()._list_records()
        return await super()._metadata_response()