Source code for kuha_common.document_store.mappings.ddi.ddi33

#!/usr/bin/env python3
# Author(s): Toni Sissala
# Copyright 2022 Finnish Social Science Data Archive FSD / University of Tampere
# Licensed under the EUPL. See LICENSE.txt for full license.
#
"""Mapping profiles for DDI 3.3"""

from itertools import chain
from kuha_common.document_store.mappings.exceptions import (
    UnknownXMLRoot,
    MappingError
)
from kuha_common.document_store.mappings.xmlbase import (
    MappedParams,
    str_equals,
    fixed_value,
    element_remove_whitespaces,
    get_preferred_publication_id_agency_pair
)
from kuha_common.document_store.mappings.ddi.ddi31 import DDI31RecordParser


_XPATH_REL_DESC_CONTENT = './r:Description/r:Content'
_XPATH_REL_TYPEOFOBJECT = './r:TypeOfObject'


[docs]class DDI33RecordParser(DDI31RecordParser): #: XML namespaces NS = {'xsi': 'http://www.w3.org/2001/XMLSchema-instance', 'xml': 'http://www.w3.org/XML/1998/namespace', 'xhtml': 'http://www.w3.org/1999/xhtml', 'ddi': 'ddi:instance:3_3', 's': 'ddi:studyunit:3_3', 'pd': 'ddi:physicaldataproduct:3_3', 'pi': 'ddi:physicalinstance:3_3', 'c': 'ddi:conceptualcomponent:3_3', 'l': 'ddi:logicalproduct:3_3', 'r': 'ddi:reusable:3_3', 'g': 'ddi:group:3_3', 'dc': 'ddi:datacollection:3_3', 'a': 'ddi:archive:3_3'} def _find_study_unit_element(self, root_element): expected_fragmentinstance_root = '{%s}FragmentInstance' % (self.NS['ddi'],) expected_studyunit_root = '{%s}StudyUnit' % (self.NS['s'],) if self._is_DDIInstance(root_element): study_unit_elements = list(root_element.iter('{%s}StudyUnit' % (self.NS['s'],))) study_unit_count = len(study_unit_elements) if study_unit_count > 1: # Currently supports only a single s:StudyUnit in xml metadata. raise MappingError("Unable to parse multiple StudyUnit elements") if study_unit_count < 1: raise MappingError("Unable to find StudyUnit element") self.study_unit_element = study_unit_elements.pop() elif root_element.tag == expected_fragmentinstance_root: study_unit_elements = list(self._find_and_iter_referred_els( './ddi:TopLevelReference', './/s:StudyUnit', root_element, lookup_element=root_element)) study_unit_count = len(study_unit_elements) if study_unit_count > 1: # Currently supports only a single s:StudyUnit in xml metadata. raise MappingError("Unable to parse multiple StudyUnit elements") if study_unit_count < 1: raise MappingError("Unable to find StudyUnit element") self.study_unit_element = study_unit_elements.pop() elif root_element.tag == expected_studyunit_root: self.study_unit_element = root_element else: raise UnknownXMLRoot(root_element.tag, expected_fragmentinstance_root, self._DDIInstance_tag(), expected_studyunit_root) def _get_reference_urn_from_element(self, element): """Look for URN or (Agency, ID, Version) triplet from element's children. Returns it as an urn (urn:ddi:<agency>:<id>:<version>) :param element: Look up this element's children. :returns: URN or empty string if no URN is found. :rtype: str """ urn_el = self._find('./r:URN', element) if urn_el is not None and urn_el.text not in ('', None): return ''.join(urn_el.itertext()) # Look for Agency & ID & Version parts = '' for xpath in ('./r:Agency', './r:ID', './r:Version'): part_el = self._find(xpath, element) if part_el is None: break parts += ':%s' % (''.join(part_el.itertext()),) else: return 'urn:ddi%s' % (parts,) return '' def _iter_reference_values(self, xpath_to_parent, element, *elements): elements = (element,) + elements for candidate_el in self._findall_from_elements(elements, xpath_to_parent): # Look for URN first urn_str = self._get_reference_urn_from_element(candidate_el) if urn_str == '': continue yield urn_str def _find_by_reference_value(self, ref_val, xpath, element=None): candidate_el = self._find('{base}/[r:URN="{ref_id}"]'.format(base=xpath, ref_id=ref_val), element) if candidate_el is not None: return candidate_el agency_val, id_val, version_val = ref_val.split(':')[-3:] for candidate_el in self._findall('{base}/[r:ID="{ref_id}"]'.format(base=xpath, ref_id=id_val), element=element): candidate_agency_el = self._find('./r:Agency', element=candidate_el) if candidate_agency_el is None or ''.join(candidate_agency_el.itertext()) != agency_val: continue candidate_version_el = self._find('./r:Version', element=candidate_el) if candidate_version_el is None or ''.join(candidate_version_el.itertext()) != version_val: continue return candidate_el def _iter_description_and_lang(self, element, default_lang=None): for content_el in self._findall(_XPATH_REL_DESC_CONTENT, element): yield ''.join(content_el.itertext()), self._get_xmllang(content_el, default=default_lang) def _get_spatial_coverage_from_study_unit(self, study_unit_element): # StudyUnit may have 0 - 1 Coverage elements. # Coverage may have 0 - 1 SpatialCoverage/SpatialCoverageReference elements. # So StudyUnit may only have 0 - 1 SpatialCoverage elements. ref_el = self._find_by_reference( self._find('./r:Coverage/r:SpatialCoverageReference/r:URN', study_unit_element), './/r:SpatialCoverage') return ref_el or self._find('./r:Coverage/r:SpatialCoverage', study_unit_element) def _get_topical_coverage_from_study_unit(self, study_unit_element): ref_el = self._find_by_reference( self._find('./r:Coverage/r:TopicalCoverageReference/r:URN', study_unit_element), './/r:TopicalCoverage') return ref_el or self._find('./r:Coverage/r:TopicalCoverage', study_unit_element) def _get_conceptual_components_from_study_unit(self, study_unit_element): cc_els = [] for cc_el in self._find_and_iter_referred_els('./r:ConceptualComponentReference', './/c:ConceptualComponent', study_unit_element): cc_els.append(cc_el) for cc_el in self._findall('./c:ConceptualComponent', study_unit_element): if cc_el is not None: cc_els.append(cc_el) return cc_els def _iter_universes_from_conceptual_components(self, conceptual_component_elements): uni_scheme_els = [] for uni_scheme_el in self._find_and_iter_referred_els('./r:UniverseSchemeReference', './/c:UniverseScheme', *conceptual_component_elements): uni_scheme_els.append(uni_scheme_el) for uni_scheme_el in self._findall_from_elements(conceptual_component_elements, './c:UniverseScheme'): if uni_scheme_el is not None: uni_scheme_els.append(uni_scheme_el) for uni_el in self._find_and_iter_referred_els('./c:UniverseReference', './/c:Universe', *uni_scheme_els): yield uni_el for uni_el in self._findall_from_elements(uni_scheme_els, './c:Universe'): if uni_el is not None: yield uni_el def _iter_archives_from_study_unit(self, study_unit_element): for archive_el in self._findall('./a:Archive', study_unit_element): yield archive_el for archive_el in self._find_and_iter_referred_els('./r:ArchiveReference', './/a:Archive', study_unit_element): yield archive_el def _iter_data_collections_from_study_unit(self, study_unit_element): for data_coll_el in self._findall('./dc:DataCollection', study_unit_element): yield data_coll_el for data_coll_el in self._find_and_iter_referred_els('./r:DataCollectionReference', './/dc:DataCollection', study_unit_element): yield data_coll_el def _iter_physical_instances_from_study_unit(self, study_unit_element): for physical_instance_el in self._findall('./pi:PhysicalInstance', study_unit_element): yield physical_instance_el for physical_instance_el in self._find_and_iter_referred_els('./r:PhysicalInstanceReference', './/pi:PhysicalInstance', study_unit_element): yield physical_instance_el def _iter_methodologys_from_study_unit(self, study_unit_element): data_colls = list(self._iter_data_collections_from_study_unit(study_unit_element)) for data_coll_el in data_colls: methodology_el = self._find('./dc:Methodology', data_coll_el) if methodology_el is not None: yield methodology_el for methodology_el in self._find_and_iter_referred_els('./dc:MethodologyReference', './/dc:Methodology', *data_colls): yield methodology_el def _iter_other_materials_from_study_unit(self, study_unit_element): for oth_mat_el in self._findall('./r:OtherMaterialScheme/r:OtherMaterial', study_unit_element): yield oth_mat_el for oth_mat_el in self._findall_from_elements( self._find_and_iter_referred_els('./r:OtherMaterialSchemeReference', './/r:OtherMaterialScheme', study_unit_element), './r:OtherMaterial'): yield oth_mat_el def _iter_study_area_countries_from_spatcov_el(self, spatial_coverage_el): countrycodes_content_els = [{'countrycode': ''.join(cc_el.itertext()), 'content_els': []} for cc_el in self._findall('./r:CountryCode', spatial_coverage_el)] content_index = 0 def _add_content_els(content_els): nonlocal content_index if content_els == []: return if len(countrycodes_content_els) <= content_index: countrycodes_content_els.append({'countrycode': None, 'content_els': content_els}) else: countrycodes_content_els[content_index]['content_els'] = content_els content_index += 1 for geographic_location_ref_el in spatial_coverage_el.findall( './r:GeographicLocationReference/r:URN', self.NS): # This is the primary lookup location as it supports multiple locations. geographic_location_el = self._find_by_reference(geographic_location_ref_el, './/r:GeographicLocation') _add_content_els(self._findall(_XPATH_REL_DESC_CONTENT, geographic_location_el)) _add_content_els(self._findall(_XPATH_REL_DESC_CONTENT, spatial_coverage_el)) for countrycode_content_els in countrycodes_content_els: if countrycode_content_els['content_els'] == []: params = MappedParams(None) params.set_language(self._get_xmllang(spatial_coverage_el, default=self.root_language)) params.keyword_arguments.update({ self._study_cls.study_area_countries.attr_abbreviation.name: countrycode_content_els['countrycode']}) yield params continue for content_el in countrycode_content_els['content_els']: params = MappedParams(''.join(content_el.itertext())) params.set_language(self._get_xmllang(content_el, default=self.root_language)) params.keyword_arguments.update({ self._study_cls.study_area_countries.attr_abbreviation.name: countrycode_content_els['countrycode']}) yield params def _iter_groups_from_ddiinstance(self, ddi_instance_el): for group_el in self._findall('./g:Group', ddi_instance_el): yield group_el for group_el in self._find_and_iter_referred_els('./r:GroupReference', './/g:Group', ddi_instance_el): yield group_el def _iter_study_area_countries_as_mapped_params(self): spatial_coverage_el = self._get_spatial_coverage_from_study_unit(self.study_unit_element) if spatial_coverage_el is None: return [] return self._iter_study_area_countries_from_spatcov_el(spatial_coverage_el) def _iter_publishers_as_mapped_params(self): for publisher_string_el in self._findall('./r:Citation/r:Publisher/r:PublisherName/r:String', self.study_unit_element): params = MappedParams(''.join(publisher_string_el.itertext())) params.set_language(self._get_xmllang(publisher_string_el, default=self._study_unit_language)) yield params for publisher_ref_el in self._findall('./r:Citation/r:Publisher/r:PublisherReference', self.study_unit_element): xpath_to_referenced_el, xpath_to_string_el = { 'Organization': ( './/a:Organization', './a:OrganizationIdentification/a:OrganizationName/r:String'), 'Individual': ( './/a:Individual', './a:IndividualIdentification/a:IndividualName/a:FullName/r:String') }[''.join(self._find(_XPATH_REL_TYPEOFOBJECT, publisher_ref_el).itertext())] referenced_el = self._find_by_reference(self._find('./r:URN', publisher_ref_el), xpath_to_referenced_el) if referenced_el is None: continue for string_el in self._findall(xpath_to_string_el, element=referenced_el): params = MappedParams(''.join(string_el.itertext())) params.set_language(self._get_xmllang(string_el, default=self._study_unit_language)) yield params def _iter_identifiers_as_mapped_params(self): archive_els = list(self._iter_archives_from_study_unit(self.study_unit_element)) for element in chain.from_iterable(( self._findall_from_elements(archive_els, './a:ArchiveSpecific/a:Collection/a:CallNumber'), self._findall_from_elements(archive_els, './a:ArchiveSpecific/a:Item/a:CallNumber'))): value = ''.join(element.itertext()) if not value: continue params = MappedParams(value) params.set_language(self._study_unit_language) yield params for inter_ident_el in self._findall('./r:Citation/r:InternationalIdentifier', self.study_unit_element): params = MappedParams(''.join(self._find('./r:IdentifierContent', element=inter_ident_el).itertext())) params.set_language(self._study_unit_language) params.keyword_arguments.update({ self._study_cls.identifiers.attr_agency.name: ''.join( self._find('./r:ManagingAgency', element=inter_ident_el).itertext())}) yield params def _iter_principal_investigators_as_mapped_params(self): def _from_organization(string_el): params = MappedParams(None) params.keyword_arguments.update({ self._study_cls.principal_investigators.attr_organization.name: ''.join(string_el.itertext())}) params.set_language(self._get_xmllang(string_el, default=self._study_unit_language)) return params def _from_individual(string_el): params = MappedParams(''.join(string_el.itertext())) params.set_language(self._get_xmllang(string_el, default=self._study_unit_language)) return params for creator_name_el in self._findall('./r:Citation/r:Creator/r:CreatorName', self.study_unit_element): for string_el in self._findall('./r:String', creator_name_el): params = MappedParams(''.join(string_el.itertext())) params.set_language(self._get_xmllang(string_el, default=self._study_unit_language)) self._get_attr_and_set_param( params, self._study_cls.principal_investigators.attr_organization.name, creator_name_el, 'affiliation') yield params for ref_el in self._findall('./r:Citation/r:Creator/r:CreatorReference', self.study_unit_element): ref_type = ''.join(self._find(_XPATH_REL_TYPEOFOBJECT, ref_el).itertext()) xpath_to_referenced_el, xpath_to_string_el, getter = { 'Organization': ( './/a:Organization', './a:OrganizationIdentification/a:OrganizationName/r:String', _from_organization), 'Individual': ( './/a:Individual', './a:IndividualIdentification/a:IndividualName/a:FullName/r:String', _from_individual )}[ref_type] referenced_el = self._find_by_reference(self._find('./r:URN', ref_el), xpath_to_referenced_el) if referenced_el is None: continue for string_el in self._findall(xpath_to_string_el, referenced_el): yield getter(string_el) def _iter_classifications_as_mapped_params(self): topcov_el = self._get_topical_coverage_from_study_unit(self.study_unit_element) for subject_el in self._findall('./r:Subject', topcov_el): params = MappedParams(None) params.set_language(self._get_xmllang(subject_el, default=self._study_unit_language)) params.keyword_arguments.update({ self._study_cls.classifications.attr_description.name: ''.join(subject_el.itertext()), self._study_cls.classifications.attr_system_name.name: subject_el.get('controlledVocabularyName', None), self._study_cls.classifications.attr_uri.name: subject_el.get('controlledVocabularyURN', None) }) yield params def _iter_keywords_as_mapped_params(self): topcov_el = self._get_topical_coverage_from_study_unit(self.study_unit_element) for subject_el in self._findall('./r:Keyword', topcov_el): params = MappedParams(None) params.set_language(self._get_xmllang(subject_el, default=self._study_unit_language)) params.keyword_arguments.update({ self._study_cls.classifications.attr_description.name: ''.join(subject_el.itertext()), self._study_cls.classifications.attr_system_name.name: subject_el.get('controlledVocabularyName', None), self._study_cls.classifications.attr_uri.name: subject_el.get('controlledVocabularyURN', None) }) yield params def _iter_study_uris_as_mapped_params(self): """Generate Study URIs :obj:`MappedParams` There is no single element to hold the URI that points to the study description web resource. Lookup multiple locations in the following order: 1. .//ddi:StudyUnit/a:Archive/a:ArchiveSpecific/a:Collection/a:URI * will also lookup archive-element by reference * a:Collection/a:CallNumber CDATA must match study_number 2. .//ddi:StudyUnit/a:Archive/a:ArchiveSpecific/a:Item/a:URI * will also lookup archive-element by reference * a:Item/a:CallNumber CDATA must match study_number 3. .//ddi:StudyUnit/r:UserID * typeOfUserID-attribute must be one of ['DOI', 'URL', 'URLServiceProvider'] """ def _dict_append(_dct): def _append(lang, key, value): valid_keys = ('description', 'location') if key not in valid_keys: raise ValueError("Invalid key '%s'. Expecting one of %s" % (key, ', '.join("'%s'" % (x,) for x in valid_keys))) if lang not in _dct: _dct[lang] = {key: value} elif key not in _dct[lang]: _dct[lang].update({key: value}) else: _dct[lang][key] += ' ' + value return _append def _from_organization(org_el): langs_attrs = {} appender = _dict_append(langs_attrs) for org_name_str_el in self._findall( './a:OrganizationIdentification/a:OrganizationName/r:String', org_el): cur_lang = self._get_xmllang(org_name_str_el, self._study_unit_language) cur_str = ''.join(org_name_str_el.itertext()) appender(cur_lang, 'location', cur_str) for cur_str, cur_lang in self._iter_description_and_lang(org_el, default_lang=self._study_unit_language): appender(cur_lang, 'description', cur_str) return langs_attrs def _from_individual(ind_el): langs_attrs = {} appender = _dict_append(langs_attrs) for ind_name_str_el in self._findall( './a:IndividualIdentification/a:IndividualName/a:FullName/r:String', ind_el): appender(self._get_xmllang(ind_name_str_el, self._study_unit_language), 'location', ''.join(ind_name_str_el.itertext())) for cur_str, cur_lang in self._iter_description_and_lang(ind_el, default_lang=self._study_unit_language): appender(cur_lang, 'description', cur_str) return langs_attrs for archive_spec_el in self._findall_from_elements( self._iter_archives_from_study_unit(self.study_unit_element), './a:ArchiveSpecific'): uri_str = None uri_el = self._find("./a:Collection/[a:CallNumber='{}']/r:URI".format(self.study_number), archive_spec_el) if uri_el is None: uri_el = self._find("./a:Item/[a:CallNumber='{}']/r:URI".format(self.study_number), archive_spec_el) if uri_el is not None: uri_str = ''.join(uri_el.itertext()) arch_org_ref_el = self._find('./a:ArchiveOrganizationReference', archive_spec_el) if arch_org_ref_el is None: if uri_el is not None: params = MappedParams(uri_str) params.set_language(self._study_unit_language) yield params continue ref_obj_type = ''.join(self._find('./a:ArchiveOrganizationReference/r:TypeOfObject', archive_spec_el).itertext()) target_xpath, getter = {'Organization': ('.//a:Organization', _from_organization), 'Individual': ('.//a:Individual', _from_individual)}[ref_obj_type] for referred_el in self._find_and_iter_referred_els('./a:ArchiveOrganizationReference', target_xpath, archive_spec_el): for lang, attrs in getter(referred_el).items(): params = MappedParams(uri_str) params.set_language(lang) params.keyword_arguments.update({ self._study_cls.study_uris.attr_location.name: attrs.get('location'), self._study_cls.study_uris.attr_description.name: attrs.get('description')}) yield params for userid_el in self._findall('./r:UserID', self.study_unit_element): if userid_el.get('typeOfUserID') in ('DOI', 'URL', 'URLServiceProvider'): params = MappedParams(''.join(userid_el.itertext())) params.set_language(self._get_xmllang(userid_el, default=self._study_unit_language)) yield params def _iter_universes_as_mapped_params(self): # ConceptualComponent / ConceptualComponentReference * # UniverseScheme / UniverseSchemeReference * # Universe / UniverseReference * # Actually even UniverseScheme may refer another UniverseScheme. Not going to recurse that deep in this point. inc_to_bool = str_equals('true', True) for uni_el in self._iter_universes_from_conceptual_components( self._get_conceptual_components_from_study_unit(self.study_unit_element)): included = inc_to_bool(uni_el.get('isInclusive')) for desc, lang in self._iter_description_and_lang(uni_el, default_lang=self._study_unit_language): params = MappedParams(desc) params.set_language(lang) params.keyword_arguments.update({self._study_cls.universes.attr_included.name: included}) yield params def __iter_cdata_and_lang_from_element_as_params(self, element, xpath, *xpaths): """Helper combines common functionality of data_access lookup""" content_els = [] for _xpath in (xpath,) + xpaths: content_els = self._findall(_xpath, element) if content_els != []: break for content_el in content_els: params = MappedParams(''.join(content_el.itertext())) params.set_language(self._get_xmllang(content_el, default=self._study_unit_language)) yield params def _iter_data_access_as_mapped_params(self): for archive_el in self._iter_archives_from_study_unit(self.study_unit_element): yield from self.__iter_cdata_and_lang_from_element_as_params( archive_el, # Primary lookup xpath './a:ArchiveSpecific/a:DefaultAccess/a:Restrictions/r:Content', # Secondary lookup xpath './a:ArchiveSpecific/a:Collection/a:DefaultAccess/a:Restrictions/r:Content') def _iter_data_access_descriptions_as_mapped_params(self): for archive_el in self._iter_archives_from_study_unit(self.study_unit_element): yield from self.__iter_cdata_and_lang_from_element_as_params( archive_el, # Primary lookup xpath './a:ArchiveSpecific/a:DefaultAccess/a:AccessConditions/r:Content', # Secondary lookup xpath './a:ArchiveSpecific/a:Collection/a:DefaultAccess/a:AccessConditions/r:Content') def _iter_citation_requirements_as_mapped_params(self): for archive_el in self._iter_archives_from_study_unit(self.study_unit_element): yield from self.__iter_cdata_and_lang_from_element_as_params( archive_el, # Primary './a:ArchiveSpecific/a:DefaultAccess/a:CitationRequirement/r:Content', # Secondary './a:ArchiveSpecific/a:Collection/a:DefaultAccess/a:CitationRequirement/r:Content') def _iter_deposit_requirements_as_mapped_params(self): for archive_el in self._iter_archives_from_study_unit(self.study_unit_element): yield from self.__iter_cdata_and_lang_from_element_as_params( archive_el, # Primary './a:ArchiveSpecific/a:DefaultAccess/a:DepositRequirement/r:Content', # Secondary './a:ArchiveSpecific/a:Collection/a:DefaultAccess/a:DepositRequirement/r:Content') def _iter_document_uris_as_mapped_params(self): ddi_instance_el = self._get_ddiinstance_el() if ddi_instance_el is not None: for oth_mat_el in self._findall('./g:ResourcePackage/r:OtherMaterialScheme/r:OtherMaterial', ddi_instance_el): type_of_material_el = self._find('./r:TypeOfMaterial', oth_mat_el) if type_of_material_el is None or ''.join(type_of_material_el.itertext()) != 'Document': continue descriptions = list(self._iter_description_and_lang( oth_mat_el, default_lang=self._study_unit_language)) for url_el in self._findall('./r:ExternalURLReference', oth_mat_el): url = ''.join(url_el.itertext()) if not descriptions: param = MappedParams(url) param.set_language(self._get_xmllang(url, self._study_unit_language)) yield param continue for desc, lang in descriptions: param = MappedParams(url) param.set_language(lang) param.keyword_arguments.update({ self._study_cls.document_uris.attr_description.name: desc}) yield param @staticmethod def _get_info_from_typeof_el(type_of_el): if type_of_el is not None: value = element_remove_whitespaces(type_of_el) cv_name = type_of_el.get('controlledVocabularyName') cv_urn = type_of_el.get('controlledVocabularyURN') else: value = None cv_name = None cv_urn = None return value, cv_name, cv_urn def _iter_time_methods_as_mapped_params(self): for time_method in self._findall_from_elements( self._iter_methodologys_from_study_unit(self.study_unit_element), './dc:TimeMethod'): type_of_el = self._find('./dc:TypeOfTimeMethod', time_method) descs_langs = list(self._iter_description_and_lang(time_method, default_lang=self._study_unit_language)) if (type_of_el, descs_langs) == (None, []): continue value, cv_name, cv_urn = self._get_info_from_typeof_el(type_of_el) if not descs_langs: params = MappedParams(value) params.set_language(self._study_unit_language) params.keyword_arguments.update({ self._study_cls.time_methods.attr_system_name.name: cv_name, self._study_cls.time_methods.attr_uri.name: cv_urn}) yield params for desc, lang in descs_langs: params = MappedParams(value) params.set_language(lang) params.keyword_arguments.update({ self._study_cls.time_methods.attr_description.name: desc, self._study_cls.time_methods.attr_system_name.name: cv_name, self._study_cls.time_methods.attr_uri.name: cv_urn}) yield params def _iter_sampling_procedures_as_mapped_params(self): for samp_proc_el in self._findall_from_elements( self._iter_methodologys_from_study_unit(self.study_unit_element), './dc:SamplingProcedure'): type_of_el = self._find('./dc:TypeOfSamplingProcedure', samp_proc_el) descs_langs = list(self._iter_description_and_lang(samp_proc_el, default_lang=self._study_unit_language)) if (type_of_el, descs_langs) == (None, []): continue value, cv_name, cv_urn = self._get_info_from_typeof_el(type_of_el) if not descs_langs: params = MappedParams(value) params.set_language(self._study_unit_language) params.keyword_arguments.update({ self._study_cls.sampling_procedures.attr_system_name.name: cv_name, self._study_cls.sampling_procedures.attr_uri.name: cv_urn}) yield params for desc, lang in descs_langs: params = MappedParams(value) params.set_language(lang) params.keyword_arguments.update({ self._study_cls.sampling_procedures.attr_description.name: desc, self._study_cls.sampling_procedures.attr_system_name.name: cv_name, self._study_cls.sampling_procedures.attr_uri.name: cv_urn}) yield params def _iter_collection_modes_as_mapped_params(self): for mode_of_coll_el in self._findall_from_elements( self._iter_data_collections_from_study_unit(self.study_unit_element), './dc:CollectionEvent/dc:ModeOfCollection'): type_of_el = self._find('./dc:TypeOfModeOfCollection', mode_of_coll_el) descs_langs = list(self._iter_description_and_lang( mode_of_coll_el, default_lang=self._study_unit_language)) if (type_of_el, descs_langs) == (None, []): continue value, cv_name, cv_urn = self._get_info_from_typeof_el(type_of_el) if not descs_langs: params = MappedParams(value) params.set_language(self._study_unit_language) params.keyword_arguments.update({ self._study_cls.collection_modes.attr_system_name.name: cv_name, self._study_cls.collection_modes.attr_uri.name: cv_urn}) yield params for desc, lang in descs_langs: params = MappedParams(value) params.set_language(lang) params.keyword_arguments.update({ self._study_cls.collection_modes.attr_description.name: desc, self._study_cls.collection_modes.attr_system_name.name: cv_name, self._study_cls.collection_modes.attr_uri.name: cv_urn}) yield params def _iter_instruments_as_mapped_params(self): """Generator yields instruments as mapped parameters Inclusive lookup locations in order: DataCollection/Instrument DataCollection/InstrumentReference DataCollection/InstrumentScheme/Instrument DataCollection/InstrumentSchemeReference DataCollection/CollectionEvent/InstrumentReference :returns: Generator yielding instruments one by one. """ def __iter_as_mapped_params(instrument_els): for instrument_el in instrument_els: type_of_instru_el = self._find('./dc:TypeOfInstrument', instrument_el) type_of_instru_value = None if type_of_instru_el is None else\ ''.join(self._find('./dc:TypeOfInstrument', instrument_el).itertext()) for string_el in self._findall('./dc:InstrumentName/r:String', instrument_el): params = MappedParams(type_of_instru_value) params.set_language(self._get_xmllang(string_el, default=self._study_unit_language)) params.keyword_arguments.update({ self._study_cls.instruments.attr_instrument_name.name: ''.join(string_el.itertext())}) yield params data_coll_els = list(self._iter_data_collections_from_study_unit(self.study_unit_element)) for param in chain.from_iterable([ # DataCollection/Instrument __iter_as_mapped_params(self._findall_from_elements(data_coll_els, './dc:Instrument')), # DataCollection/InstrumentReference __iter_as_mapped_params(self._find_and_iter_referred_els('./dc:InstrumentReference', './/dc:Instrument', *data_coll_els)), # DataCollection/InstrumentScheme/Instrument __iter_as_mapped_params(self._findall_from_elements( data_coll_els, './dc:InstrumentScheme/dc:Instrument')), # DataCollection/InstrumentSchemeReference __iter_as_mapped_params(self._findall_from_elements( self._find_and_iter_referred_els('./r:InstrumentSchemeReference', './/dc:InstrumentScheme', *data_coll_els), './dc:Instrument')), # DataCollection/CollectionEvent/InstrumentReference __iter_as_mapped_params(self._find_and_iter_referred_els( './dc:CollectionEvent/dc:InstrumentReference', './/dc:Instrument', *data_coll_els))]): yield param def _iter_file_names_as_mapped_params(self): for physical_instance_el in self._iter_physical_instances_from_study_unit(self.study_unit_element): # CESSDA instructs to maintain datafile languages in citation/language, but there is no way # to identify the file in that element. Making guesses here. filelangs = [] for language_el in self._findall('./r:Citation/r:Language', physical_instance_el): filelangs.append(''.join(language_el.itertext())) for datafileuri_el in self._findall('./pi:DataFileIdentification/pi:DataFileURI', physical_instance_el): params = MappedParams(''.join(datafileuri_el.itertext())) if not filelangs: params.set_language(self._study_unit_language) else: params.set_language(filelangs.pop(0)) yield params def __iter_params_from_group_properties(self, title_str_els, content_str_els, id_els, uri): """Helper digs out parameters from Group element properties""" ids = [''.join(id_el.itertext()) for id_el in id_els] if id_els != [] else [None] langs = [self._get_xmllang(element, self._study_unit_language) for element in chain.from_iterable([title_str_els, content_str_els])] # Dict comprehension results in unique keys even if langs has duplicates. mapped_langs_values = {lang: {} for lang in langs} for title_str_el in title_str_els: mapped_langs_values[self._get_xmllang(title_str_el, self._study_unit_language)].update({ 'title': ''.join(title_str_el.itertext())}) for content_str_el in content_str_els: mapped_langs_values[self._get_xmllang(content_str_el, self._study_unit_language)].update({ 'desc': ''.join(content_str_el.itertext())}) for _id in ids: for lang, values in mapped_langs_values.items(): params = MappedParams(_id) params.set_language(lang) params.keyword_arguments.update({ self._study_cls.study_groups.attr_name.name: values.get('title'), self._study_cls.study_groups.attr_description.name: values.get('desc'), self._study_cls.study_groups.attr_uri.name: uri}) yield params def _iter_study_groups_as_mapped_params(self): ddi_instance_el = self._get_ddiinstance_el() if ddi_instance_el: for group_el in self._iter_groups_from_ddiinstance(ddi_instance_el): title_str_els = self._findall('./r:Citation/r:Title/r:String', group_el) content_str_els = self._findall('./r:Abstract/r:Content', group_el) id_els = self._findall('./r:Citation/r:InternationalIdentifier/r:IdentifierContent', group_el) uri = group_el.get('externalReferenceDefaultURI') yield from self.__iter_params_from_group_properties(title_str_els, content_str_els, id_els, uri) def __iter_other_materials_for_related_publications(self): """Helper iterates OtherMaterial elements. If one is found from StudyUnit (inline or via reference) yields it and bypasses DDIInstance/g:ResourcePackage lookup. Otherwise also traverses DDIInstance/g:ResourcePackage/r:OtherMaterialScheme/r:OtherMaterial """ found = False def _is_related_publication(oth_mat_el): type_of_material_el = self._find('./r:TypeOfMaterial', oth_mat_el) return type_of_material_el is not None \ and ''.join(type_of_material_el.itertext()).strip() == 'Related Publication' for oth_mat_el in self._iter_other_materials_from_study_unit(self.study_unit_element): if _is_related_publication(oth_mat_el): yield oth_mat_el found = True if not found: ddi_instance_el = self._get_ddiinstance_el() if ddi_instance_el: for oth_mat_el in self._findall('./g:ResourcePackage/r:OtherMaterialScheme/r:OtherMaterial', ddi_instance_el): if _is_related_publication(oth_mat_el): yield oth_mat_el def _iter_related_publications_as_mapped_params(self): for oth_mat_el in self.__iter_other_materials_for_related_publications(): title_str_els = self._findall('./r:Citation/r:Title/r:String', oth_mat_el) desc_str_els = self._findall(_XPATH_REL_DESC_CONTENT, oth_mat_el) # Kuha Study model supports only single url for a publication. Take the first one. ext_url_ref_el = self._find('./r:ExternalURLReference', oth_mat_el) uri = ''.join(ext_url_ref_el.itertext()) if ext_url_ref_el is not None else None # Kuha Study model supports only single date for publication. Take the first one. simple_date_el = self._find('./r:Citation/r:PublicationDate/r:SimpleDate', oth_mat_el) distribution_date = ''.join(simple_date_el.itertext()) if simple_date_el is not None else None ids_agencys = [] for id_el in self._findall('./r:Citation/r:InternationalIdentifier', oth_mat_el): ids_agencys.append(( ''.join(self._find('./r:IdentifierContent', id_el).itertext()), ''.join(self._find('./r:ManagingAgency', id_el).itertext()))) yield from self._iter_params_from_othmat_properties( title_str_els, desc_str_els=desc_str_els, uri=uri, distribution_date=distribution_date, id_agency_pair=get_preferred_publication_id_agency_pair(ids_agencys)) def _get_role_and_grant_numbers_from_funding_info_el(self, funding_info_el): funder_role_el = self._find('./r:FunderRole', funding_info_el) role = ''.join(funder_role_el.itertext()) if funder_role_el is not None else None grant_numbers = [] for grant_number_el in self._findall('./r:GrantNumber', funding_info_el): grant_numbers.append(''.join(grant_number_el.itertext())) return role, grant_numbers def _iter_funding_agencies_as_mapped_params(self): for funding_info_el in self._iter_funding_informations_from_study_unit(self.study_unit_element): role, grant_numbers = self._get_role_and_grant_numbers_from_funding_info_el(funding_info_el) grant_number = grant_numbers.pop(0) if grant_numbers else None description = ' '.join([ ''.join(elem.itertext()) for elem in self._findall(_XPATH_REL_DESC_CONTENT, funding_info_el)])\ or None for agency_org_ref_el in self._findall('./r:AgencyOrganizationReference', funding_info_el): xpath_to_referenced_el, xpath_to_string_el = { 'Organization': ( './/a:Organization', './a:OrganizationIdentification/a:OrganizationName/r:String'), 'Individual': ( './/a:Individual', './a:IndividualIdentification/a:IndividualName/a:FullName/r:String') }[''.join(self._find(_XPATH_REL_TYPEOFOBJECT, agency_org_ref_el).itertext())] urn_str = self._get_reference_urn_from_element(agency_org_ref_el) referenced_el = self._find_by_reference_value(urn_str, xpath_to_referenced_el) if referenced_el is None: # Did not find referenced element from this DDI XML. continue for str_el in self._findall(xpath_to_string_el, referenced_el): params = MappedParams(''.join(str_el.itertext())) params.set_language(self._get_xmllang(str_el, self.root_language)) params.keyword_arguments.update({ self._study_cls.funding_agencies.attr_grant_number.name: grant_number, self._study_cls.funding_agencies.attr_role.name: role, self._study_cls.funding_agencies.attr_description.name: description}) yield params @property def _study_maps(self): return { 'StudyUnit': [ (self._study_cls.add_study_titles, self._map_multi('./r:Citation/r:Title/r:String')), (self._study_cls.add_parallel_titles, self._map_multi('./r:Citation/r:AlternateTitle/r:String')), (self._study_cls.add_abstract, self._map_multi('./r:Abstract/r:Content')), (self._study_cls.add_publication_years, self._map_multi('./r:Citation/r:PublicationDate/r:SimpleDate'). add_attribute(self._study_cls.publication_years.attr_distribution_date.name, self._map_single('.'))), (self._study_cls.add_analysis_units, self._map_multi('./r:AnalysisUnit').set_value_conversion( fixed_value(None)). add_attribute(self._study_cls.analysis_units.attr_description.name, self._map_single('.')). add_attribute(self._study_cls.analysis_units.attr_system_name.name, self._map_single('.', 'controlledVocabularyName')). add_attribute(self._study_cls.analysis_units.attr_uri.name, self._map_single('.', 'controlledVocabularyURN'))), (self._study_cls.add_data_collection_copyrights, self._map_multi('./r:Citation/r:Copyright/r:String')) ], 'DDIInstance': [ (self._study_cls.add_document_titles, self._map_multi('./r:Citation/r:Title/r:String')), (self._study_cls.add_copyrights, self._map_multi('./r:Citation/r:Copyright/r:String'))] } @property def studies(self): if self.study_number is None: self._parse_study_number() study = self._study_cls() study.add_study_number(self.study_number_identifier) self._map_to_record(study, self.study_unit_element, self._study_maps['StudyUnit'], default_language=self._study_unit_language) ddi_instance_el = self._get_ddiinstance_el() if ddi_instance_el is not None: self._map_to_record(study, ddi_instance_el, self._study_maps['DDIInstance'], default_language=self._study_unit_language) for add_func, mapping in [ (study.add_collection_periods, self._iter_collection_periods_as_mapped_params), (study.add_principal_investigators, self._iter_principal_investigators_as_mapped_params), (study.add_classifications, self._iter_classifications_as_mapped_params), (study.add_keywords, self._iter_keywords_as_mapped_params), (study.add_study_uris, self._iter_study_uris_as_mapped_params), (study.add_universes, self._iter_universes_as_mapped_params), (study.add_study_area_countries, self._iter_study_area_countries_as_mapped_params), (study.add_publishers, self._iter_publishers_as_mapped_params), (study.add_identifiers, self._iter_identifiers_as_mapped_params), (study.add_data_access, self._iter_data_access_as_mapped_params), (study.add_data_access_descriptions, self._iter_data_access_descriptions_as_mapped_params), (study.add_citation_requirements, self._iter_citation_requirements_as_mapped_params), (study.add_deposit_requirements, self._iter_deposit_requirements_as_mapped_params), (study.add_document_uris, self._iter_document_uris_as_mapped_params), (study.add_time_methods, self._iter_time_methods_as_mapped_params), (study.add_sampling_procedures, self._iter_sampling_procedures_as_mapped_params), (study.add_collection_modes, self._iter_collection_modes_as_mapped_params), (study.add_instruments, self._iter_instruments_as_mapped_params), (study.add_file_names, self._iter_file_names_as_mapped_params), (study.add_study_groups, self._iter_study_groups_as_mapped_params), (study.add_related_publications, self._iter_related_publications_as_mapped_params), (study.add_funding_agencies, self._iter_funding_agencies_as_mapped_params), (study.add_grant_numbers, self._iter_grant_numbers_as_mapped_params)]: for params in mapping(): add_func(*params.arguments, **params.keyword_arguments) yield study