#!/usr/bin/env python3
# Author(s): Toni Sissala
# Copyright 2025 Finnish Social Science Data Archive FSD / University of Tampere
# Licensed under the EUPL. See LICENSE.txt for full license.
#
"""Mapping profiles for DDI 3.3"""
from itertools import chain
from kuha_common.document_store.mappings.xmlbase import MappedParams
from kuha_common.document_store.mappings.ddi.lifecycle import DDILifecycle32ParserBase
class DDI33RecordParser(DDILifecycle32ParserBase):
    """XML Record Parser for DDI 3.3 files

    DDI 3.3 introduces OtherMaterialSchemes and uses the attributes
    controlledVocabularyName and controlledVocabularyURN to define
    controlled vocabularies.

    Helpers such as ``_find``, ``_findall``, ``_findall_from_elements`` and
    ``_find_and_iter_referred_els`` are not defined here; they are inherited
    from the parser base class.
    """

    #: XML namespaces
    NS = {
        'xsi': 'http://www.w3.org/2001/XMLSchema-instance',
        'xml': 'http://www.w3.org/XML/1998/namespace',
        'xhtml': 'http://www.w3.org/1999/xhtml',
        'ddi': 'ddi:instance:3_3',
        's': 'ddi:studyunit:3_3',
        'pd': 'ddi:physicaldataproduct:3_3',
        'pi': 'ddi:physicalinstance:3_3',
        'c': 'ddi:conceptualcomponent:3_3',
        'l': 'ddi:logicalproduct:3_3',
        'r': 'ddi:reusable:3_3',
        'g': 'ddi:group:3_3',
        'dc': 'ddi:datacollection:3_3',
        'a': 'ddi:archive:3_3',
    }

    @property
    def _att_cv_name(self):
        """Name of the XML attribute that holds a controlled vocabulary name."""
        return "controlledVocabularyName"

    @property
    def _att_cv_urn(self):
        """Name of the XML attribute that holds a controlled vocabulary URN."""
        return "controlledVocabularyURN"

    def _type_of_material_equals(self, oth_mat_el, expected):
        """Tell whether an OtherMaterial element declares the given type.

        The text content of the ``r:TypeOfMaterial`` child is compared with
        surrounding whitespace removed, so pretty-printed XML matches too.

        :param oth_mat_el: OtherMaterial element to inspect.
        :param str expected: Exact type name to match, e.g. ``'Document'``.
        :returns: True if a TypeOfMaterial child exists and its stripped
                  text equals ``expected``, False otherwise.
        :rtype: bool
        """
        type_of_material_el = self._find('./r:TypeOfMaterial', oth_mat_el)
        if type_of_material_el is None:
            return False
        return ''.join(type_of_material_el.itertext()).strip() == expected

    def _iter_other_materials_from_study_unit(self, study_unit_element):
        """Generator yields OtherMaterial elements of a StudyUnit.

        Lookup locations in order:

        StudyUnit/OtherMaterialScheme/OtherMaterial
        StudyUnit/OtherMaterialSchemeReference

        :param study_unit_element: StudyUnit element used as lookup root.
        :returns: Generator yielding OtherMaterial elements one by one.
        """
        # Inline schemes first.
        yield from self._findall('./r:OtherMaterialScheme/r:OtherMaterial', study_unit_element)
        # Then OtherMaterial children of the schemes that the inherited
        # reference helper produces for OtherMaterialSchemeReference.
        yield from self._findall_from_elements(
            self._find_and_iter_referred_els(
                './r:OtherMaterialSchemeReference', './/r:OtherMaterialScheme', study_unit_element
            ),
            './r:OtherMaterial',
        )

    def _iter_instruments_as_mapped_params(self):
        """Generator yields instruments as mapped parameters

        Inclusive lookup locations in order:

        DataCollection/Instrument
        DataCollection/InstrumentReference
        DataCollection/InstrumentScheme/Instrument
        DataCollection/InstrumentSchemeReference
        DataCollection/CollectionEvent/InstrumentReference

        :returns: Generator yielding instruments one by one.
        """
        # Materialized as a list because the elements are traversed once per
        # lookup location below.
        data_coll_els = list(self._iter_data_collections_from_element(self.study_unit_element))
        yield from chain.from_iterable(
            [
                # DataCollection/Instrument
                self._iter_instruments_as_mapped_params_from_instrument_els(
                    self._findall_from_elements(data_coll_els, './dc:Instrument')
                ),
                # DataCollection/InstrumentReference
                self._iter_instruments_as_mapped_params_from_instrument_els(
                    self._find_and_iter_referred_els('./dc:InstrumentReference', './/dc:Instrument', *data_coll_els)
                ),
                # DataCollection/InstrumentScheme/Instrument
                self._iter_instruments_as_mapped_params_from_instrument_els(
                    self._findall_from_elements(data_coll_els, './dc:InstrumentScheme/dc:Instrument')
                ),
                # DataCollection/InstrumentSchemeReference
                self._iter_instruments_as_mapped_params_from_instrument_els(
                    self._findall_from_elements(
                        self._find_and_iter_referred_els(
                            './r:InstrumentSchemeReference', './/dc:InstrumentScheme', *data_coll_els
                        ),
                        './dc:Instrument',
                    )
                ),
                # DataCollection/CollectionEvent/InstrumentReference
                self._iter_instruments_as_mapped_params_from_instrument_els(
                    self._find_and_iter_referred_els(
                        './dc:CollectionEvent/dc:InstrumentReference', './/dc:Instrument', *data_coll_els
                    )
                ),
            ]
        )

    def _iter_other_materials_for_document_uris(self):
        """Generator yields OtherMaterial elements whose type is 'Document'.

        Looks up DDIInstance/g:ResourcePackage/r:OtherMaterialScheme/r:OtherMaterial
        and yields nothing when there is no DDIInstance element.

        :returns: Generator yielding OtherMaterial elements one by one.
        """
        if self._ddiinstance_element is None:
            return
        for oth_mat_el in self._findall(
            './g:ResourcePackage/r:OtherMaterialScheme/r:OtherMaterial', self._ddiinstance_element
        ):
            # Whitespace around the type text is ignored, consistently with
            # the related publications lookup.
            if self._type_of_material_equals(oth_mat_el, 'Document'):
                yield oth_mat_el

    def _iter_other_materials_for_related_publications(self):
        """Helper iterates OtherMaterial elements typed 'Related Publication'.

        Every match found from StudyUnit (inline or via reference) is
        yielded. If there was at least one, the DDIInstance/g:ResourcePackage
        lookup is bypassed. Otherwise also traverses
        DDIInstance/g:ResourcePackage/r:OtherMaterialScheme/r:OtherMaterial

        :returns: Generator yielding OtherMaterial elements one by one.
        """
        found = False
        for oth_mat_el in self._iter_other_materials_from_study_unit(self.study_unit_element):
            if self._type_of_material_equals(oth_mat_el, 'Related Publication'):
                yield oth_mat_el
                found = True
        if found or self._ddiinstance_element is None:
            return
        # Fallback: nothing usable in StudyUnit, look into the resource package.
        for oth_mat_el in self._findall(
            './g:ResourcePackage/r:OtherMaterialScheme/r:OtherMaterial', self._ddiinstance_element
        ):
            if self._type_of_material_equals(oth_mat_el, 'Related Publication'):
                yield oth_mat_el

    def _iter_data_access_descriptions_as_mapped_params(self):
        """Generator yields data access descriptions as mapped parameters

        Uses StudyUnit/Archive/ArchiveSpecific/Item/Access/TypeOfAccess when
        present: its text becomes the value, its language falls back to the
        study unit language, and its controlledVocabularyName attribute is
        passed as the element version. When no TypeOfAccess is found, the
        lookup of the parent class is used instead.

        :returns: Generator yielding mapped parameters.
        """
        for typeofaccess_el in self._findall(
            './a:Archive/a:ArchiveSpecific/a:Item/a:Access/a:TypeOfAccess', element=self.study_unit_element
        ):
            params = MappedParams(''.join(typeofaccess_el.itertext()))
            params.set_language(self._get_xmllang(typeofaccess_el, default=self._study_unit_language))
            params.keyword_arguments.update(
                {
                    self._study_cls.data_access_descriptions.attr_element_version.name: typeofaccess_el.get(
                        self._att_cv_name
                    )
                }
            )
            yield params
            # NOTE(review): only the first matching TypeOfAccess is emitted and
            # the parent lookup is skipped; confirm that any further
            # TypeOfAccess elements are intentionally ignored.
            return
        yield from super()._iter_data_access_descriptions_as_mapped_params()