#!/usr/bin/env python3
# Author(s): Toni Sissala
# Copyright 2022 Finnish Social Science Data Archive FSD / University of Tampere
# Licensed under the EUPL. See LICENSE.txt for full license.
#
"""Mapping profiles for DDI 3.3"""
from itertools import chain
from kuha_common.document_store.mappings.exceptions import (
UnknownXMLRoot,
MappingError
)
from kuha_common.document_store.mappings.xmlbase import (
MappedParams,
str_equals,
fixed_value,
element_remove_whitespaces,
get_preferred_publication_id_agency_pair
)
from kuha_common.document_store.mappings.ddi.ddi31 import DDI31RecordParser
# XPath, relative to the element being looked up, to the r:Content
# children of its r:Description.
_XPATH_REL_DESC_CONTENT = './r:Description/r:Content'
# XPath, relative to the element being looked up, to its r:TypeOfObject child.
_XPATH_REL_TYPEOFOBJECT = './r:TypeOfObject'
[docs]class DDI33RecordParser(DDI31RecordParser):
#: XML namespaces. Maps the prefixes used in this parser's XPath
#: expressions to namespace URIs (DDI 3.3 modules and common XML namespaces).
NS = {'xsi': 'http://www.w3.org/2001/XMLSchema-instance',
'xml': 'http://www.w3.org/XML/1998/namespace',
'xhtml': 'http://www.w3.org/1999/xhtml',
'ddi': 'ddi:instance:3_3',
's': 'ddi:studyunit:3_3',
'pd': 'ddi:physicaldataproduct:3_3',
'pi': 'ddi:physicalinstance:3_3',
'c': 'ddi:conceptualcomponent:3_3',
'l': 'ddi:logicalproduct:3_3',
'r': 'ddi:reusable:3_3',
'g': 'ddi:group:3_3',
'dc': 'ddi:datacollection:3_3',
'a': 'ddi:archive:3_3'}
def _find_study_unit_element(self, root_element):
expected_fragmentinstance_root = '{%s}FragmentInstance' % (self.NS['ddi'],)
expected_studyunit_root = '{%s}StudyUnit' % (self.NS['s'],)
if self._is_DDIInstance(root_element):
study_unit_elements = list(root_element.iter('{%s}StudyUnit' % (self.NS['s'],)))
study_unit_count = len(study_unit_elements)
if study_unit_count > 1:
# Currently supports only a single s:StudyUnit in xml metadata.
raise MappingError("Unable to parse multiple StudyUnit elements")
if study_unit_count < 1:
raise MappingError("Unable to find StudyUnit element")
self.study_unit_element = study_unit_elements.pop()
elif root_element.tag == expected_fragmentinstance_root:
study_unit_elements = list(self._find_and_iter_referred_els(
'./ddi:TopLevelReference',
'.//s:StudyUnit',
root_element,
lookup_element=root_element))
study_unit_count = len(study_unit_elements)
if study_unit_count > 1:
# Currently supports only a single s:StudyUnit in xml metadata.
raise MappingError("Unable to parse multiple StudyUnit elements")
if study_unit_count < 1:
raise MappingError("Unable to find StudyUnit element")
self.study_unit_element = study_unit_elements.pop()
elif root_element.tag == expected_studyunit_root:
self.study_unit_element = root_element
else:
raise UnknownXMLRoot(root_element.tag,
expected_fragmentinstance_root,
self._DDIInstance_tag(),
expected_studyunit_root)
def _get_reference_urn_from_element(self, element):
"""Look for URN or (Agency, ID, Version) triplet from element's children.
Returns it as an urn (urn:ddi:<agency>:<id>:<version>)
:param element: Look up this element's children.
:returns: URN or empty string if no URN is found.
:rtype: str
"""
urn_el = self._find('./r:URN', element)
if urn_el is not None and urn_el.text not in ('', None):
return ''.join(urn_el.itertext())
# Look for Agency & ID & Version
parts = ''
for xpath in ('./r:Agency', './r:ID', './r:Version'):
part_el = self._find(xpath, element)
if part_el is None:
break
parts += ':%s' % (''.join(part_el.itertext()),)
else:
return 'urn:ddi%s' % (parts,)
return ''
def _iter_reference_values(self, xpath_to_parent, element, *elements):
elements = (element,) + elements
for candidate_el in self._findall_from_elements(elements, xpath_to_parent):
# Look for URN first
urn_str = self._get_reference_urn_from_element(candidate_el)
if urn_str == '':
continue
yield urn_str
def _find_by_reference_value(self, ref_val, xpath, element=None):
candidate_el = self._find('{base}/[r:URN="{ref_id}"]'.format(base=xpath, ref_id=ref_val), element)
if candidate_el is not None:
return candidate_el
agency_val, id_val, version_val = ref_val.split(':')[-3:]
for candidate_el in self._findall('{base}/[r:ID="{ref_id}"]'.format(base=xpath, ref_id=id_val),
element=element):
candidate_agency_el = self._find('./r:Agency', element=candidate_el)
if candidate_agency_el is None or ''.join(candidate_agency_el.itertext()) != agency_val:
continue
candidate_version_el = self._find('./r:Version', element=candidate_el)
if candidate_version_el is None or ''.join(candidate_version_el.itertext()) != version_val:
continue
return candidate_el
def _iter_description_and_lang(self, element, default_lang=None):
    """Generate (description, language) pairs from element's r:Description/r:Content.

    :param element: Element whose description contents are looked up.
    :param default_lang: Language to use if a content element does not
                         declare one.
    :returns: Generator yielding tuples of description text and language.
    """
    yield from (
        (''.join(content_el.itertext()), self._get_xmllang(content_el, default=default_lang))
        for content_el in self._findall(_XPATH_REL_DESC_CONTENT, element))
def _get_spatial_coverage_from_study_unit(self, study_unit_element):
# StudyUnit may have 0 - 1 Coverage elements.
# Coverage may have 0 - 1 SpatialCoverage/SpatialCoverageReference elements.
# So StudyUnit may only have 0 - 1 SpatialCoverage elements.
ref_el = self._find_by_reference(
self._find('./r:Coverage/r:SpatialCoverageReference/r:URN', study_unit_element),
'.//r:SpatialCoverage')
return ref_el or self._find('./r:Coverage/r:SpatialCoverage', study_unit_element)
def _get_topical_coverage_from_study_unit(self, study_unit_element):
ref_el = self._find_by_reference(
self._find('./r:Coverage/r:TopicalCoverageReference/r:URN', study_unit_element),
'.//r:TopicalCoverage')
return ref_el or self._find('./r:Coverage/r:TopicalCoverage', study_unit_element)
def _get_conceptual_components_from_study_unit(self, study_unit_element):
cc_els = []
for cc_el in self._find_and_iter_referred_els('./r:ConceptualComponentReference',
'.//c:ConceptualComponent',
study_unit_element):
cc_els.append(cc_el)
for cc_el in self._findall('./c:ConceptualComponent', study_unit_element):
if cc_el is not None:
cc_els.append(cc_el)
return cc_els
def _iter_universes_from_conceptual_components(self, conceptual_component_elements):
uni_scheme_els = []
for uni_scheme_el in self._find_and_iter_referred_els('./r:UniverseSchemeReference',
'.//c:UniverseScheme',
*conceptual_component_elements):
uni_scheme_els.append(uni_scheme_el)
for uni_scheme_el in self._findall_from_elements(conceptual_component_elements, './c:UniverseScheme'):
if uni_scheme_el is not None:
uni_scheme_els.append(uni_scheme_el)
for uni_el in self._find_and_iter_referred_els('./c:UniverseReference', './/c:Universe', *uni_scheme_els):
yield uni_el
for uni_el in self._findall_from_elements(uni_scheme_els, './c:Universe'):
if uni_el is not None:
yield uni_el
def _iter_archives_from_study_unit(self, study_unit_element):
for archive_el in self._findall('./a:Archive', study_unit_element):
yield archive_el
for archive_el in self._find_and_iter_referred_els('./r:ArchiveReference', './/a:Archive', study_unit_element):
yield archive_el
def _iter_data_collections_from_study_unit(self, study_unit_element):
for data_coll_el in self._findall('./dc:DataCollection', study_unit_element):
yield data_coll_el
for data_coll_el in self._find_and_iter_referred_els('./r:DataCollectionReference',
'.//dc:DataCollection',
study_unit_element):
yield data_coll_el
def _iter_physical_instances_from_study_unit(self, study_unit_element):
for physical_instance_el in self._findall('./pi:PhysicalInstance', study_unit_element):
yield physical_instance_el
for physical_instance_el in self._find_and_iter_referred_els('./r:PhysicalInstanceReference',
'.//pi:PhysicalInstance',
study_unit_element):
yield physical_instance_el
def _iter_methodologys_from_study_unit(self, study_unit_element):
data_colls = list(self._iter_data_collections_from_study_unit(study_unit_element))
for data_coll_el in data_colls:
methodology_el = self._find('./dc:Methodology', data_coll_el)
if methodology_el is not None:
yield methodology_el
for methodology_el in self._find_and_iter_referred_els('./dc:MethodologyReference',
'.//dc:Methodology',
*data_colls):
yield methodology_el
def _iter_other_materials_from_study_unit(self, study_unit_element):
for oth_mat_el in self._findall('./r:OtherMaterialScheme/r:OtherMaterial', study_unit_element):
yield oth_mat_el
for oth_mat_el in self._findall_from_elements(
self._find_and_iter_referred_els('./r:OtherMaterialSchemeReference',
'.//r:OtherMaterialScheme',
study_unit_element),
'./r:OtherMaterial'):
yield oth_mat_el
def _iter_study_area_countries_from_spatcov_el(self, spatial_coverage_el):
    """Generate study area countries as :obj:`MappedParams` from SpatialCoverage.

    Country codes (r:CountryCode) are paired by position with
    non-empty groups of description contents. The groups come first
    from every referred GeographicLocation and last from the
    SpatialCoverage itself. Groups exceeding the number of country
    codes are yielded without a country code. A country code without
    contents is yielded with a value of None.

    :param spatial_coverage_el: SpatialCoverage element.
    :returns: Generator yielding :obj:`MappedParams`.
    """
    entries = [{'countrycode': ''.join(code_el.itertext()), 'content_els': []}
               for code_el in self._findall('./r:CountryCode', spatial_coverage_el)]

    def _iter_content_el_groups():
        # This is the primary lookup location as it supports multiple locations.
        for urn_el in spatial_coverage_el.findall('./r:GeographicLocationReference/r:URN', self.NS):
            location_el = self._find_by_reference(urn_el, './/r:GeographicLocation')
            yield self._findall(_XPATH_REL_DESC_CONTENT, location_el)
        yield self._findall(_XPATH_REL_DESC_CONTENT, spatial_coverage_el)

    # Index of the entry that receives the next non-empty group.
    slot = 0
    for content_els in _iter_content_el_groups():
        if content_els == []:
            continue
        if slot < len(entries):
            entries[slot]['content_els'] = content_els
        else:
            entries.append({'countrycode': None, 'content_els': content_els})
        slot += 1

    def _as_params(value, lang_el, countrycode):
        params = MappedParams(value)
        params.set_language(self._get_xmllang(lang_el, default=self.root_language))
        params.keyword_arguments.update({
            self._study_cls.study_area_countries.attr_abbreviation.name: countrycode})
        return params

    for entry in entries:
        if entry['content_els'] == []:
            # Country code without a description.
            yield _as_params(None, spatial_coverage_el, entry['countrycode'])
            continue
        for content_el in entry['content_els']:
            yield _as_params(''.join(content_el.itertext()), content_el, entry['countrycode'])
def _iter_groups_from_ddiinstance(self, ddi_instance_el):
for group_el in self._findall('./g:Group', ddi_instance_el):
yield group_el
for group_el in self._find_and_iter_referred_els('./r:GroupReference',
'.//g:Group',
ddi_instance_el):
yield group_el
def _iter_study_area_countries_as_mapped_params(self):
spatial_coverage_el = self._get_spatial_coverage_from_study_unit(self.study_unit_element)
if spatial_coverage_el is None:
return []
return self._iter_study_area_countries_from_spatcov_el(spatial_coverage_el)
def _iter_publishers_as_mapped_params(self):
    """Generate publishers as :obj:`MappedParams`.

    Inline publisher names (r:PublisherName/r:String) are yielded
    first. After those come the names of the organizations or
    individuals referred to by r:PublisherReference. The r:TypeOfObject
    of the reference selects which one is looked up.

    :returns: Generator yielding :obj:`MappedParams`.
    """
    def _name_to_params(name_string_el):
        params = MappedParams(''.join(name_string_el.itertext()))
        params.set_language(self._get_xmllang(name_string_el, default=self._study_unit_language))
        return params

    for inline_name_el in self._findall('./r:Citation/r:Publisher/r:PublisherName/r:String',
                                        self.study_unit_element):
        yield _name_to_params(inline_name_el)
    # TypeOfObject value -> (xpath to referred element, xpath to its name strings)
    lookup_xpaths = {
        'Organization': (
            './/a:Organization',
            './a:OrganizationIdentification/a:OrganizationName/r:String'),
        'Individual': (
            './/a:Individual',
            './a:IndividualIdentification/a:IndividualName/a:FullName/r:String')
    }
    for reference_el in self._findall('./r:Citation/r:Publisher/r:PublisherReference',
                                      self.study_unit_element):
        type_of_object = ''.join(self._find(_XPATH_REL_TYPEOFOBJECT, reference_el).itertext())
        xpath_to_target, xpath_to_names = lookup_xpaths[type_of_object]
        target_el = self._find_by_reference(self._find('./r:URN', reference_el), xpath_to_target)
        if target_el is None:
            continue
        for referred_name_el in self._findall(xpath_to_names, element=target_el):
            yield _name_to_params(referred_name_el)
def _iter_identifiers_as_mapped_params(self):
    """Generate study identifiers as :obj:`MappedParams`.

    Non-empty call numbers of the archives' collections and items are
    yielded first. After those come the international identifiers of
    the citation, with their managing agency as the agency attribute.

    :returns: Generator yielding :obj:`MappedParams`.
    """
    archive_els = list(self._iter_archives_from_study_unit(self.study_unit_element))
    call_number_els = chain(
        self._findall_from_elements(archive_els, './a:ArchiveSpecific/a:Collection/a:CallNumber'),
        self._findall_from_elements(archive_els, './a:ArchiveSpecific/a:Item/a:CallNumber'))
    for call_number_el in call_number_els:
        call_number = ''.join(call_number_el.itertext())
        if call_number:
            params = MappedParams(call_number)
            params.set_language(self._study_unit_language)
            yield params
    for identifier_el in self._findall('./r:Citation/r:InternationalIdentifier', self.study_unit_element):
        content_el = self._find('./r:IdentifierContent', element=identifier_el)
        params = MappedParams(''.join(content_el.itertext()))
        params.set_language(self._study_unit_language)
        agency_el = self._find('./r:ManagingAgency', element=identifier_el)
        params.keyword_arguments.update({
            self._study_cls.identifiers.attr_agency.name: ''.join(agency_el.itertext())})
        yield params
def _iter_principal_investigators_as_mapped_params(self):
    """Generate principal investigators as :obj:`MappedParams`.

    Inline creator names (r:CreatorName/r:String) are yielded first;
    the ``affiliation`` attribute of r:CreatorName is passed to
    ``_get_attr_and_set_param`` for the organization attribute. After
    those come the creators referred to by r:CreatorReference. A
    referred organization is yielded with a value of None and its name
    as the organization attribute. A referred individual is yielded
    with its full name as the value.

    :returns: Generator yielding :obj:`MappedParams`.
    """
    def _lang_of(string_el):
        return self._get_xmllang(string_el, default=self._study_unit_language)

    def _organization_params(string_el):
        params = MappedParams(None)
        params.keyword_arguments.update({
            self._study_cls.principal_investigators.attr_organization.name: ''.join(string_el.itertext())})
        params.set_language(_lang_of(string_el))
        return params

    def _individual_params(string_el):
        params = MappedParams(''.join(string_el.itertext()))
        params.set_language(_lang_of(string_el))
        return params

    for creator_name_el in self._findall('./r:Citation/r:Creator/r:CreatorName', self.study_unit_element):
        for name_string_el in self._findall('./r:String', creator_name_el):
            params = _individual_params(name_string_el)
            self._get_attr_and_set_param(
                params,
                self._study_cls.principal_investigators.attr_organization.name,
                creator_name_el,
                'affiliation')
            yield params
    # TypeOfObject value -> (xpath to referred element, xpath to its name strings, builder)
    lookup = {
        'Organization': (
            './/a:Organization',
            './a:OrganizationIdentification/a:OrganizationName/r:String',
            _organization_params),
        'Individual': (
            './/a:Individual',
            './a:IndividualIdentification/a:IndividualName/a:FullName/r:String',
            _individual_params)}
    for reference_el in self._findall('./r:Citation/r:Creator/r:CreatorReference', self.study_unit_element):
        type_of_object = ''.join(self._find(_XPATH_REL_TYPEOFOBJECT, reference_el).itertext())
        xpath_to_target, xpath_to_names, build_params = lookup[type_of_object]
        target_el = self._find_by_reference(self._find('./r:URN', reference_el), xpath_to_target)
        if target_el is None:
            continue
        for name_string_el in self._findall(xpath_to_names, target_el):
            yield build_params(name_string_el)
def _iter_classifications_as_mapped_params(self):
    """Generate classifications as :obj:`MappedParams`.

    Every r:Subject of the study unit's TopicalCoverage is yielded with
    a value of None. The subject text is the description attribute and
    the controlledVocabularyName / controlledVocabularyURN attributes
    are the system name and the uri.

    :returns: Generator yielding :obj:`MappedParams`.
    """
    topical_coverage_el = self._get_topical_coverage_from_study_unit(self.study_unit_element)
    for subject_el in self._findall('./r:Subject', topical_coverage_el):
        params = MappedParams(None)
        params.set_language(self._get_xmllang(subject_el, default=self._study_unit_language))
        field = self._study_cls.classifications
        params.keyword_arguments.update({
            field.attr_description.name: ''.join(subject_el.itertext()),
            field.attr_system_name.name: subject_el.get('controlledVocabularyName'),
            field.attr_uri.name: subject_el.get('controlledVocabularyURN')})
        yield params
def _iter_keywords_as_mapped_params(self):
    """Generate keywords as :obj:`MappedParams`.

    Every r:Keyword of the study unit's TopicalCoverage is yielded with
    a value of None. The keyword text is the description attribute and
    the controlledVocabularyName / controlledVocabularyURN attributes
    are the system name and the uri.

    :returns: Generator yielding :obj:`MappedParams`.
    """
    topical_coverage_el = self._get_topical_coverage_from_study_unit(self.study_unit_element)
    for keyword_el in self._findall('./r:Keyword', topical_coverage_el):
        params = MappedParams(None)
        params.set_language(self._get_xmllang(keyword_el, default=self._study_unit_language))
        # NOTE(review): attribute names are taken from ``classifications``,
        # not from a keywords field. Presumably the names are equal - confirm.
        field = self._study_cls.classifications
        params.keyword_arguments.update({
            field.attr_description.name: ''.join(keyword_el.itertext()),
            field.attr_system_name.name: keyword_el.get('controlledVocabularyName'),
            field.attr_uri.name: keyword_el.get('controlledVocabularyURN')})
        yield params
def _iter_study_uris_as_mapped_params(self):
    """Generate Study URIs :obj:`MappedParams`

    There is no single element to hold the URI that points to
    the study description web resource. Lookup multiple locations
    in the following order:

    1. a:Archive/a:ArchiveSpecific/a:Collection/r:URI of the study unit

       * will also lookup archive-element by reference
       * a:Collection/a:CallNumber CDATA must match study_number

    2. a:Archive/a:ArchiveSpecific/a:Item/r:URI of the study unit

       * will also lookup archive-element by reference
       * a:Item/a:CallNumber CDATA must match study_number
       * looked up only if 1. gives no match

    3. r:UserID of the study unit

       * typeOfUserID-attribute must be one of ['DOI', 'URL', 'URLServiceProvider']

    If a:ArchiveSpecific has an a:ArchiveOrganizationReference, the
    names and descriptions of the referred organization or individual
    are used as location and description attributes, one
    :obj:`MappedParams` per language.

    :returns: Generator yielding :obj:`MappedParams`.
    """
    valid_keys = ('description', 'location')

    def _collect_by_lang(referred_el, xpath_to_name_strings):
        """Gather location (names) and description of referred_el grouped by language."""
        by_lang = {}

        def _add(lang, key, value):
            if key not in valid_keys:
                raise ValueError("Invalid key '%s'. Expecting one of %s"
                                 % (key, ', '.join("'%s'" % (x,) for x in valid_keys)))
            lang_attrs = by_lang.setdefault(lang, {})
            if key in lang_attrs:
                # Multiple values of the same language are joined.
                lang_attrs[key] += ' ' + value
            else:
                lang_attrs[key] = value

        for name_string_el in self._findall(xpath_to_name_strings, referred_el):
            name_lang = self._get_xmllang(name_string_el, self._study_unit_language)
            _add(name_lang, 'location', ''.join(name_string_el.itertext()))
        for desc, desc_lang in self._iter_description_and_lang(
                referred_el, default_lang=self._study_unit_language):
            _add(desc_lang, 'description', desc)
        return by_lang

    archive_els = self._iter_archives_from_study_unit(self.study_unit_element)
    for archive_spec_el in self._findall_from_elements(archive_els, './a:ArchiveSpecific'):
        uri_el = self._find("./a:Collection/[a:CallNumber='{}']/r:URI".format(self.study_number),
                            archive_spec_el)
        if uri_el is None:
            uri_el = self._find("./a:Item/[a:CallNumber='{}']/r:URI".format(self.study_number),
                                archive_spec_el)
        uri_str = None if uri_el is None else ''.join(uri_el.itertext())
        if self._find('./a:ArchiveOrganizationReference', archive_spec_el) is None:
            # No organization to get location & description from.
            if uri_el is not None:
                params = MappedParams(uri_str)
                params.set_language(self._study_unit_language)
                yield params
            continue
        type_of_object_el = self._find('./a:ArchiveOrganizationReference/r:TypeOfObject',
                                       archive_spec_el)
        target_xpath, xpath_to_name_strings = {
            'Organization': ('.//a:Organization',
                             './a:OrganizationIdentification/a:OrganizationName/r:String'),
            'Individual': ('.//a:Individual',
                           './a:IndividualIdentification/a:IndividualName/a:FullName/r:String')
        }[''.join(type_of_object_el.itertext())]
        for referred_el in self._find_and_iter_referred_els('./a:ArchiveOrganizationReference',
                                                            target_xpath,
                                                            archive_spec_el):
            for lang, attrs in _collect_by_lang(referred_el, xpath_to_name_strings).items():
                params = MappedParams(uri_str)
                params.set_language(lang)
                params.keyword_arguments.update({
                    self._study_cls.study_uris.attr_location.name: attrs.get('location'),
                    self._study_cls.study_uris.attr_description.name: attrs.get('description')})
                yield params
    for userid_el in self._findall('./r:UserID', self.study_unit_element):
        if userid_el.get('typeOfUserID') not in ('DOI', 'URL', 'URLServiceProvider'):
            continue
        params = MappedParams(''.join(userid_el.itertext()))
        params.set_language(self._get_xmllang(userid_el, default=self._study_unit_language))
        yield params
def _iter_universes_as_mapped_params(self):
    """Generate universes as :obj:`MappedParams`.

    One :obj:`MappedParams` is yielded for every description content of
    every Universe. The ``isInclusive`` attribute of the Universe is
    converted with ``str_equals('true', True)`` and used as the included
    attribute.

    :returns: Generator yielding :obj:`MappedParams`.
    """
    # ConceptualComponent / ConceptualComponentReference *
    #   UniverseScheme / UniverseSchemeReference *
    #     Universe / UniverseReference *
    # Actually even UniverseScheme may refer another UniverseScheme. Not going to recurse that deep in this point.
    to_included = str_equals('true', True)
    conceptual_component_els = self._get_conceptual_components_from_study_unit(self.study_unit_element)
    for universe_el in self._iter_universes_from_conceptual_components(conceptual_component_els):
        is_included = to_included(universe_el.get('isInclusive'))
        descs_langs = self._iter_description_and_lang(universe_el, default_lang=self._study_unit_language)
        for description, language in descs_langs:
            params = MappedParams(description)
            params.set_language(language)
            params.keyword_arguments.update({self._study_cls.universes.attr_included.name: is_included})
            yield params
def __iter_cdata_and_lang_from_element_as_params(self, element, xpath, *xpaths):
    """Helper combines common functionality of data_access lookup

    The xpaths are tried in order and the elements of the first one
    giving a non-empty result are used.

    :param element: Element to look up from.
    :param xpath: Primary lookup xpath.
    :param xpaths: Fallback lookup xpaths.
    :returns: Generator yielding :obj:`MappedParams` with the element
              text as value.
    """
    found_els = []
    for candidate_xpath in (xpath, *xpaths):
        found_els = self._findall(candidate_xpath, element)
        if found_els != []:
            # First xpath with a match wins.
            break
    for found_el in found_els:
        params = MappedParams(''.join(found_el.itertext()))
        params.set_language(self._get_xmllang(found_el, default=self._study_unit_language))
        yield params
def _iter_data_access_as_mapped_params(self):
for archive_el in self._iter_archives_from_study_unit(self.study_unit_element):
yield from self.__iter_cdata_and_lang_from_element_as_params(
archive_el,
# Primary lookup xpath
'./a:ArchiveSpecific/a:DefaultAccess/a:Restrictions/r:Content',
# Secondary lookup xpath
'./a:ArchiveSpecific/a:Collection/a:DefaultAccess/a:Restrictions/r:Content')
def _iter_data_access_descriptions_as_mapped_params(self):
for archive_el in self._iter_archives_from_study_unit(self.study_unit_element):
yield from self.__iter_cdata_and_lang_from_element_as_params(
archive_el,
# Primary lookup xpath
'./a:ArchiveSpecific/a:DefaultAccess/a:AccessConditions/r:Content',
# Secondary lookup xpath
'./a:ArchiveSpecific/a:Collection/a:DefaultAccess/a:AccessConditions/r:Content')
def _iter_citation_requirements_as_mapped_params(self):
for archive_el in self._iter_archives_from_study_unit(self.study_unit_element):
yield from self.__iter_cdata_and_lang_from_element_as_params(
archive_el,
# Primary
'./a:ArchiveSpecific/a:DefaultAccess/a:CitationRequirement/r:Content',
# Secondary
'./a:ArchiveSpecific/a:Collection/a:DefaultAccess/a:CitationRequirement/r:Content')
def _iter_deposit_requirements_as_mapped_params(self):
for archive_el in self._iter_archives_from_study_unit(self.study_unit_element):
yield from self.__iter_cdata_and_lang_from_element_as_params(
archive_el,
# Primary
'./a:ArchiveSpecific/a:DefaultAccess/a:DepositRequirement/r:Content',
# Secondary
'./a:ArchiveSpecific/a:Collection/a:DefaultAccess/a:DepositRequirement/r:Content')
def _iter_document_uris_as_mapped_params(self):
    """Generate document URIs as :obj:`MappedParams`.

    Looks up r:OtherMaterial elements from
    g:ResourcePackage/r:OtherMaterialScheme of the DDIInstance and
    uses the ones whose r:TypeOfMaterial is ``Document``. Every
    r:ExternalURLReference of such material is yielded once per
    description of the material, with the description as an attribute.
    If the material has no descriptions, the URL is yielded alone.

    Yields nothing if there is no DDIInstance element.

    :returns: Generator yielding :obj:`MappedParams`.
    """
    ddi_instance_el = self._get_ddiinstance_el()
    if ddi_instance_el is None:
        return
    for oth_mat_el in self._findall('./g:ResourcePackage/r:OtherMaterialScheme/r:OtherMaterial',
                                    ddi_instance_el):
        type_of_material_el = self._find('./r:TypeOfMaterial', oth_mat_el)
        if type_of_material_el is None or ''.join(type_of_material_el.itertext()) != 'Document':
            continue
        descriptions = list(self._iter_description_and_lang(
            oth_mat_el, default_lang=self._study_unit_language))
        for url_el in self._findall('./r:ExternalURLReference', oth_mat_el):
            url = ''.join(url_el.itertext())
            if not descriptions:
                param = MappedParams(url)
                # The language is looked up from the element. Passing the
                # url string here was a bug.
                param.set_language(self._get_xmllang(url_el, default=self._study_unit_language))
                yield param
                continue
            for desc, lang in descriptions:
                param = MappedParams(url)
                param.set_language(lang)
                param.keyword_arguments.update({
                    self._study_cls.document_uris.attr_description.name: desc})
                yield param
@staticmethod
def _get_info_from_typeof_el(type_of_el):
if type_of_el is not None:
value = element_remove_whitespaces(type_of_el)
cv_name = type_of_el.get('controlledVocabularyName')
cv_urn = type_of_el.get('controlledVocabularyURN')
else:
value = None
cv_name = None
cv_urn = None
return value, cv_name, cv_urn
def _iter_time_methods_as_mapped_params(self):
    """Generate time methods as :obj:`MappedParams`.

    Looks up dc:TimeMethod elements from the methodologies of the
    study unit. The value and controlled vocabulary attributes come
    from dc:TypeOfTimeMethod. One :obj:`MappedParams` is yielded per
    description, or a single one without a description attribute if
    there are no descriptions. Elements without both type and
    descriptions are skipped.

    :returns: Generator yielding :obj:`MappedParams`.
    """
    methodology_els = self._iter_methodologys_from_study_unit(self.study_unit_element)
    for time_method_el in self._findall_from_elements(methodology_els, './dc:TimeMethod'):
        type_of_el = self._find('./dc:TypeOfTimeMethod', time_method_el)
        descs_langs = list(self._iter_description_and_lang(
            time_method_el, default_lang=self._study_unit_language))
        if type_of_el is None and not descs_langs:
            # Nothing to map.
            continue
        value, cv_name, cv_urn = self._get_info_from_typeof_el(type_of_el)
        field = self._study_cls.time_methods
        if not descs_langs:
            params = MappedParams(value)
            params.set_language(self._study_unit_language)
            params.keyword_arguments.update({
                field.attr_system_name.name: cv_name,
                field.attr_uri.name: cv_urn})
            yield params
            continue
        for description, language in descs_langs:
            params = MappedParams(value)
            params.set_language(language)
            params.keyword_arguments.update({
                field.attr_description.name: description,
                field.attr_system_name.name: cv_name,
                field.attr_uri.name: cv_urn})
            yield params
def _iter_sampling_procedures_as_mapped_params(self):
    """Generate sampling procedures as :obj:`MappedParams`.

    Looks up dc:SamplingProcedure elements from the methodologies of
    the study unit. The value and controlled vocabulary attributes
    come from dc:TypeOfSamplingProcedure. One :obj:`MappedParams` is
    yielded per description, or a single one without a description
    attribute if there are no descriptions. Elements without both type
    and descriptions are skipped.

    :returns: Generator yielding :obj:`MappedParams`.
    """
    methodology_els = self._iter_methodologys_from_study_unit(self.study_unit_element)
    for sampling_procedure_el in self._findall_from_elements(methodology_els, './dc:SamplingProcedure'):
        type_of_el = self._find('./dc:TypeOfSamplingProcedure', sampling_procedure_el)
        descs_langs = list(self._iter_description_and_lang(
            sampling_procedure_el, default_lang=self._study_unit_language))
        if type_of_el is None and not descs_langs:
            # Nothing to map.
            continue
        value, cv_name, cv_urn = self._get_info_from_typeof_el(type_of_el)
        field = self._study_cls.sampling_procedures
        if not descs_langs:
            params = MappedParams(value)
            params.set_language(self._study_unit_language)
            params.keyword_arguments.update({
                field.attr_system_name.name: cv_name,
                field.attr_uri.name: cv_urn})
            yield params
            continue
        for description, language in descs_langs:
            params = MappedParams(value)
            params.set_language(language)
            params.keyword_arguments.update({
                field.attr_description.name: description,
                field.attr_system_name.name: cv_name,
                field.attr_uri.name: cv_urn})
            yield params
def _iter_collection_modes_as_mapped_params(self):
    """Generate collection modes as :obj:`MappedParams`.

    Looks up dc:CollectionEvent/dc:ModeOfCollection elements from the
    data collections of the study unit. The value and controlled
    vocabulary attributes come from dc:TypeOfModeOfCollection. One
    :obj:`MappedParams` is yielded per description, or a single one
    without a description attribute if there are no descriptions.
    Elements without both type and descriptions are skipped.

    :returns: Generator yielding :obj:`MappedParams`.
    """
    data_collection_els = self._iter_data_collections_from_study_unit(self.study_unit_element)
    for mode_el in self._findall_from_elements(data_collection_els,
                                               './dc:CollectionEvent/dc:ModeOfCollection'):
        type_of_el = self._find('./dc:TypeOfModeOfCollection', mode_el)
        descs_langs = list(self._iter_description_and_lang(
            mode_el, default_lang=self._study_unit_language))
        if type_of_el is None and not descs_langs:
            # Nothing to map.
            continue
        value, cv_name, cv_urn = self._get_info_from_typeof_el(type_of_el)
        field = self._study_cls.collection_modes
        if not descs_langs:
            params = MappedParams(value)
            params.set_language(self._study_unit_language)
            params.keyword_arguments.update({
                field.attr_system_name.name: cv_name,
                field.attr_uri.name: cv_urn})
            yield params
            continue
        for description, language in descs_langs:
            params = MappedParams(value)
            params.set_language(language)
            params.keyword_arguments.update({
                field.attr_description.name: description,
                field.attr_system_name.name: cv_name,
                field.attr_uri.name: cv_urn})
            yield params
def _iter_instruments_as_mapped_params(self):
    """Generator yields instruments as mapped parameters

    Inclusive lookup locations in order:

      DataCollection/Instrument
      DataCollection/InstrumentReference
      DataCollection/InstrumentScheme/Instrument
      DataCollection/InstrumentSchemeReference
      DataCollection/CollectionEvent/InstrumentReference

    One :obj:`MappedParams` is yielded per instrument name string. The
    value is the text of dc:TypeOfInstrument (None if missing) and the
    name string is the instrument name attribute.

    :returns: Generator yielding instruments one by one.
    """
    def _iter_params(instrument_els):
        for instrument_el in instrument_els:
            type_el = self._find('./dc:TypeOfInstrument', instrument_el)
            instrument_type = None if type_el is None else ''.join(type_el.itertext())
            for name_string_el in self._findall('./dc:InstrumentName/r:String', instrument_el):
                params = MappedParams(instrument_type)
                params.set_language(self._get_xmllang(name_string_el, default=self._study_unit_language))
                params.keyword_arguments.update({
                    self._study_cls.instruments.attr_instrument_name.name: ''.join(name_string_el.itertext())})
                yield params

    data_collection_els = list(self._iter_data_collections_from_study_unit(self.study_unit_element))
    instrument_sources = (
        # DataCollection/Instrument
        self._findall_from_elements(data_collection_els, './dc:Instrument'),
        # DataCollection/InstrumentReference
        self._find_and_iter_referred_els('./dc:InstrumentReference',
                                         './/dc:Instrument',
                                         *data_collection_els),
        # DataCollection/InstrumentScheme/Instrument
        self._findall_from_elements(data_collection_els, './dc:InstrumentScheme/dc:Instrument'),
        # DataCollection/InstrumentSchemeReference
        self._findall_from_elements(
            self._find_and_iter_referred_els('./r:InstrumentSchemeReference',
                                             './/dc:InstrumentScheme',
                                             *data_collection_els),
            './dc:Instrument'),
        # DataCollection/CollectionEvent/InstrumentReference
        self._find_and_iter_referred_els('./dc:CollectionEvent/dc:InstrumentReference',
                                         './/dc:Instrument',
                                         *data_collection_els))
    yield from _iter_params(chain.from_iterable(instrument_sources))
def _iter_file_names_as_mapped_params(self):
    """Generate data file names (URIs) as :obj:`MappedParams`.

    Every pi:DataFileIdentification/pi:DataFileURI of every
    PhysicalInstance is yielded. Languages of r:Citation/r:Language
    are assigned to the files by position. Files exceeding the number
    of languages get the study unit language.

    :returns: Generator yielding :obj:`MappedParams`.
    """
    for physical_instance_el in self._iter_physical_instances_from_study_unit(self.study_unit_element):
        # CESSDA instructs to maintain datafile languages in citation/language, but there is no way
        # to identify the file in that element. Making guesses here.
        file_langs = iter([''.join(language_el.itertext()) for language_el in
                           self._findall('./r:Citation/r:Language', physical_instance_el)])
        for uri_el in self._findall('./pi:DataFileIdentification/pi:DataFileURI', physical_instance_el):
            params = MappedParams(''.join(uri_el.itertext()))
            # Next unused file language, or study unit language when run out.
            params.set_language(next(file_langs, self._study_unit_language))
            yield params
def __iter_params_from_group_properties(self, title_str_els, content_str_els, id_els, uri):
    """Helper digs out parameters from Group element properties

    Titles and descriptions are grouped by language. One
    :obj:`MappedParams` is yielded for every combination of identifier
    and language. If there are no identifiers, the value is None.

    :param title_str_els: Title string elements.
    :param content_str_els: Description content elements.
    :param id_els: Identifier elements.
    :param uri: URI of the group.
    :returns: Generator yielding :obj:`MappedParams`.
    """
    if id_els != []:
        identifiers = [''.join(id_el.itertext()) for id_el in id_els]
    else:
        identifiers = [None]
    # Reserve a slot for every language, in order of first appearance.
    values_by_lang = {}
    for localized_el in chain(title_str_els, content_str_els):
        values_by_lang.setdefault(self._get_xmllang(localized_el, self._study_unit_language), {})
    for title_el in title_str_els:
        title_lang = self._get_xmllang(title_el, self._study_unit_language)
        values_by_lang[title_lang]['title'] = ''.join(title_el.itertext())
    for content_el in content_str_els:
        content_lang = self._get_xmllang(content_el, self._study_unit_language)
        values_by_lang[content_lang]['desc'] = ''.join(content_el.itertext())
    for identifier in identifiers:
        for language, values in values_by_lang.items():
            params = MappedParams(identifier)
            params.set_language(language)
            params.keyword_arguments.update({
                self._study_cls.study_groups.attr_name.name: values.get('title'),
                self._study_cls.study_groups.attr_description.name: values.get('desc'),
                self._study_cls.study_groups.attr_uri.name: uri})
            yield params
def _iter_study_groups_as_mapped_params(self):
ddi_instance_el = self._get_ddiinstance_el()
if ddi_instance_el:
for group_el in self._iter_groups_from_ddiinstance(ddi_instance_el):
title_str_els = self._findall('./r:Citation/r:Title/r:String', group_el)
content_str_els = self._findall('./r:Abstract/r:Content', group_el)
id_els = self._findall('./r:Citation/r:InternationalIdentifier/r:IdentifierContent', group_el)
uri = group_el.get('externalReferenceDefaultURI')
yield from self.__iter_params_from_group_properties(title_str_els, content_str_els, id_els, uri)
def __iter_other_materials_for_related_publications(self):
"""Helper iterates OtherMaterial elements.
If one is found from StudyUnit (inline or via reference) yields it
and bypasses DDIInstance/g:ResourcePackage lookup. Otherwise also
traverses DDIInstance/g:ResourcePackage/r:OtherMaterialScheme/r:OtherMaterial
"""
found = False
def _is_related_publication(oth_mat_el):
type_of_material_el = self._find('./r:TypeOfMaterial', oth_mat_el)
return type_of_material_el is not None \
and ''.join(type_of_material_el.itertext()).strip() == 'Related Publication'
for oth_mat_el in self._iter_other_materials_from_study_unit(self.study_unit_element):
if _is_related_publication(oth_mat_el):
yield oth_mat_el
found = True
if not found:
ddi_instance_el = self._get_ddiinstance_el()
if ddi_instance_el:
for oth_mat_el in self._findall('./g:ResourcePackage/r:OtherMaterialScheme/r:OtherMaterial',
ddi_instance_el):
if _is_related_publication(oth_mat_el):
yield oth_mat_el
def _iter_related_publications_as_mapped_params(self):
    """Iterate related publications as mapped params.

    For every OtherMaterial element that describes a related publication
    the title strings, description contents, first external URL, first
    publication date and the preferred (identifier, agency) pair are
    gathered and passed on to ``_iter_params_from_othmat_properties``.

    International identifiers that lack r:IdentifierContent are ignored.
    An identifier that lacks r:ManagingAgency is kept with ``None`` as
    its agency.

    :returns: generator yielding the params produced by
              ``_iter_params_from_othmat_properties``.
    """
    for oth_mat_el in self.__iter_other_materials_for_related_publications():
        title_str_els = self._findall('./r:Citation/r:Title/r:String', oth_mat_el)
        desc_str_els = self._findall(_XPATH_REL_DESC_CONTENT, oth_mat_el)
        # Kuha Study model supports only single url for a publication. Take the first one.
        ext_url_ref_el = self._find('./r:ExternalURLReference', oth_mat_el)
        uri = ''.join(ext_url_ref_el.itertext()) if ext_url_ref_el is not None else None
        # Kuha Study model supports only single date for publication. Take the first one.
        simple_date_el = self._find('./r:Citation/r:PublicationDate/r:SimpleDate', oth_mat_el)
        distribution_date = ''.join(simple_date_el.itertext()) if simple_date_el is not None else None
        ids_agencys = []
        for id_el in self._findall('./r:Citation/r:InternationalIdentifier', oth_mat_el):
            id_content_el = self._find('./r:IdentifierContent', id_el)
            if id_content_el is None:
                # Without content there is no identifier value to map.
                continue
            agency_el = self._find('./r:ManagingAgency', id_el)
            # NOTE(review): a missing agency is passed on as None; confirm that
            # get_preferred_publication_id_agency_pair accepts a None agency.
            agency = ''.join(agency_el.itertext()) if agency_el is not None else None
            ids_agencys.append((''.join(id_content_el.itertext()), agency))
        yield from self._iter_params_from_othmat_properties(
            title_str_els, desc_str_els=desc_str_els, uri=uri,
            distribution_date=distribution_date,
            id_agency_pair=get_preferred_publication_id_agency_pair(ids_agencys))
def _get_role_and_grant_numbers_from_funding_info_el(self, funding_info_el):
    """Read funder role and grant numbers from a funding information element.

    :param funding_info_el: element whose r:FunderRole and r:GrantNumber
                            children are looked up.
    :returns: two-tuple ``(role, grant_numbers)``. ``role`` is the text
              of the first r:FunderRole child or None if there is none.
              ``grant_numbers`` is a list holding the text of every
              r:GrantNumber child; empty if there are none.
    :rtype: tuple
    """
    role_el = self._find('./r:FunderRole', funding_info_el)
    if role_el is None:
        role = None
    else:
        role = ''.join(role_el.itertext())
    numbers = [''.join(number_el.itertext())
               for number_el in self._findall('./r:GrantNumber', funding_info_el)]
    return role, numbers
def _iter_funding_agencies_as_mapped_params(self):
    """Iterate funding agencies as mapped params.

    For every funding information element of the study unit, each
    r:AgencyOrganizationReference is resolved to the a:Organization or
    a:Individual element it refers to. One params object is yielded per
    name string of the resolved element. Every params object carries the
    role, the first grant number and the space-joined description
    contents of the funding information element.

    References are skipped when their r:TypeOfObject is missing or is
    neither ``Organization`` nor ``Individual``, and when the referenced
    element is not found from this XML.

    :returns: generator yielding :class:`MappedParams` instances.
    """
    # Referenced object type -> (xpath to referenced element,
    #                            xpath to its name strings).
    # Built once; it does not vary between references.
    xpaths_by_type_of_object = {
        'Organization': (
            './/a:Organization',
            './a:OrganizationIdentification/a:OrganizationName/r:String'),
        'Individual': (
            './/a:Individual',
            './a:IndividualIdentification/a:IndividualName/a:FullName/r:String')
    }
    for funding_info_el in self._iter_funding_informations_from_study_unit(self.study_unit_element):
        role, grant_numbers = self._get_role_and_grant_numbers_from_funding_info_el(funding_info_el)
        # Only the first grant number is attached to the agency.
        grant_number = grant_numbers[0] if grant_numbers else None
        description = ' '.join(
            ''.join(content_el.itertext())
            for content_el in self._findall(_XPATH_REL_DESC_CONTENT, funding_info_el)) or None
        for agency_org_ref_el in self._findall('./r:AgencyOrganizationReference', funding_info_el):
            type_of_object_el = self._find(_XPATH_REL_TYPEOFOBJECT, agency_org_ref_el)
            if type_of_object_el is None:
                # Cannot tell what kind of element is referenced.
                continue
            xpaths = xpaths_by_type_of_object.get(
                ''.join(type_of_object_el.itertext()).strip())
            if xpaths is None:
                # Referenced object type has no name lookup defined here.
                continue
            xpath_to_referenced_el, xpath_to_string_el = xpaths
            urn_str = self._get_reference_urn_from_element(agency_org_ref_el)
            referenced_el = self._find_by_reference_value(urn_str, xpath_to_referenced_el)
            if referenced_el is None:
                # Did not find referenced element from this DDI XML.
                continue
            for str_el in self._findall(xpath_to_string_el, referenced_el):
                params = MappedParams(''.join(str_el.itertext()))
                params.set_language(self._get_xmllang(str_el, self.root_language))
                params.keyword_arguments.update({
                    self._study_cls.funding_agencies.attr_grant_number.name: grant_number,
                    self._study_cls.funding_agencies.attr_role.name: role,
                    self._study_cls.funding_agencies.attr_description.name: description})
                yield params
@property
def _study_maps(self):
    """Declarative mappings from XML paths to study record add-methods.

    :returns: dict with keys ``'StudyUnit'`` and ``'DDIInstance'``. Each
              value is a list of ``(add_method, mapping)`` two-tuples,
              where the xpaths of the mappings are relative to the
              element named by the key.
    :rtype: dict
    """
    study_cls = self._study_cls
    map_multi = self._map_multi
    map_single = self._map_single

    study_unit_maps = [
        (study_cls.add_study_titles,
         map_multi('./r:Citation/r:Title/r:String')),
        (study_cls.add_parallel_titles,
         map_multi('./r:Citation/r:AlternateTitle/r:String')),
        (study_cls.add_abstract,
         map_multi('./r:Abstract/r:Content')),
    ]

    # Publication year; the element itself is also mapped to the
    # distribution date attribute.
    publication_years_map = map_multi('./r:Citation/r:PublicationDate/r:SimpleDate')
    publication_years_map = publication_years_map.add_attribute(
        study_cls.publication_years.attr_distribution_date.name,
        map_single('.'))
    study_unit_maps.append((study_cls.add_publication_years, publication_years_map))

    # Analysis unit; value is converted with fixed_value(None), the element
    # itself and its controlled vocabulary attributes are mapped to attributes.
    analysis_units_map = map_multi('./r:AnalysisUnit')
    analysis_units_map = analysis_units_map.set_value_conversion(fixed_value(None))
    analysis_units_map = analysis_units_map.add_attribute(
        study_cls.analysis_units.attr_description.name,
        map_single('.'))
    analysis_units_map = analysis_units_map.add_attribute(
        study_cls.analysis_units.attr_system_name.name,
        map_single('.', 'controlledVocabularyName'))
    analysis_units_map = analysis_units_map.add_attribute(
        study_cls.analysis_units.attr_uri.name,
        map_single('.', 'controlledVocabularyURN'))
    study_unit_maps.append((study_cls.add_analysis_units, analysis_units_map))

    study_unit_maps.append(
        (study_cls.add_data_collection_copyrights,
         map_multi('./r:Citation/r:Copyright/r:String')))

    ddi_instance_maps = [
        (study_cls.add_document_titles,
         map_multi('./r:Citation/r:Title/r:String')),
        (study_cls.add_copyrights,
         map_multi('./r:Citation/r:Copyright/r:String')),
    ]
    return {'StudyUnit': study_unit_maps,
            'DDIInstance': ddi_instance_maps}
@property
def studies(self):
    """Generator property yielding the study record mapped from the XML.

    Parses the study number first if it has not been parsed yet. Then
    creates a study record, applies the declarative ``_study_maps`` to
    the StudyUnit element and, when one is available, to the DDIInstance
    element. Finally feeds the record from every ``_iter_*_as_mapped_params``
    generator listed below.

    :returns: generator yielding a single study record.
    """
    if self.study_number is None:
        self._parse_study_number()
    record = self._study_cls()
    record.add_study_number(self.study_number_identifier)

    # Declarative mappings: StudyUnit always, DDIInstance when present.
    self._map_to_record(
        record,
        self.study_unit_element,
        self._study_maps['StudyUnit'],
        default_language=self._study_unit_language)
    instance_el = self._get_ddiinstance_el()
    if instance_el is not None:
        self._map_to_record(
            record,
            instance_el,
            self._study_maps['DDIInstance'],
            default_language=self._study_unit_language)

    # Pairs of (record add-method, generator of mapped params to feed it).
    adders_and_iterators = (
        (record.add_collection_periods, self._iter_collection_periods_as_mapped_params),
        (record.add_principal_investigators, self._iter_principal_investigators_as_mapped_params),
        (record.add_classifications, self._iter_classifications_as_mapped_params),
        (record.add_keywords, self._iter_keywords_as_mapped_params),
        (record.add_study_uris, self._iter_study_uris_as_mapped_params),
        (record.add_universes, self._iter_universes_as_mapped_params),
        (record.add_study_area_countries, self._iter_study_area_countries_as_mapped_params),
        (record.add_publishers, self._iter_publishers_as_mapped_params),
        (record.add_identifiers, self._iter_identifiers_as_mapped_params),
        (record.add_data_access, self._iter_data_access_as_mapped_params),
        (record.add_data_access_descriptions, self._iter_data_access_descriptions_as_mapped_params),
        (record.add_citation_requirements, self._iter_citation_requirements_as_mapped_params),
        (record.add_deposit_requirements, self._iter_deposit_requirements_as_mapped_params),
        (record.add_document_uris, self._iter_document_uris_as_mapped_params),
        (record.add_time_methods, self._iter_time_methods_as_mapped_params),
        (record.add_sampling_procedures, self._iter_sampling_procedures_as_mapped_params),
        (record.add_collection_modes, self._iter_collection_modes_as_mapped_params),
        (record.add_instruments, self._iter_instruments_as_mapped_params),
        (record.add_file_names, self._iter_file_names_as_mapped_params),
        (record.add_study_groups, self._iter_study_groups_as_mapped_params),
        (record.add_related_publications, self._iter_related_publications_as_mapped_params),
        (record.add_funding_agencies, self._iter_funding_agencies_as_mapped_params),
        (record.add_grant_numbers, self._iter_grant_numbers_as_mapped_params),
    )
    for add_to_record, iter_mapped_params in adders_and_iterators:
        for mapped in iter_mapped_params():
            add_to_record(*mapped.arguments, **mapped.keyword_arguments)
    yield record