#!/usr/bin/env python3
# Author(s): Toni Sissala
# Copyright 2022 Finnish Social Science Data Archive FSD / University of Tampere
# Licensed under the EUPL. See LICENSE.txt for full license.
#
"""Mapping profiles for DDI-codebook versions.
"""
import logging
from kuha_common.document_store.mappings.exceptions import UnknownXMLRoot
from kuha_common.document_store.mappings.xmlbase import (
MappedParams,
XMLParserBase,
as_valid_identifier,
str_equals,
element_remove_whitespaces,
element_strip_descendant_text,
element_strip_descendant_text_by_tag,
get_preferred_publication_id_agency_pair
)
_logger = logging.getLogger(__name__)
class DDI122RecordParser(XMLParserBase):
    """Parse Document Store records from DDI 1.2.2. XML.

    The root element's tag must equal :attr:`_expected_root_tag`. Subclasses
    override that attribute to accept differently namespaced root elements.
    """

    #: Tag the root element must carry; compared verbatim with ``root_element.tag``.
    _expected_root_tag = 'codeBook'

    def __init__(self, root_element):
        """Validate the root element's tag and initialize the base parser.

        :param root_element: Root element of the XML document to parse.
        :raises UnknownXMLRoot: If the tag of ``root_element`` differs from
                                :attr:`_expected_root_tag`.
        """
        if root_element.tag != self._expected_root_tag:
            raise UnknownXMLRoot(root_element.tag, self._expected_root_tag)
        super().__init__(root_element)

    def _parse_study_number(self):
        """Parse and store study number.

        Takes the first value mapped from ``./stdyDscr/citation/titlStmt/IDNo``
        (mapped with ``required=True``) and stores it to ``self.study_number``.
        """
        _map = self._map_single('./stdyDscr/citation/titlStmt/IDNo', required=True)
        self.study_number = next(_map(self.root_element, self.root_language, self.NS)).get_value()

    @classmethod
    def _iter_params_from_sernames_serinfos(cls, study_group_id, study_group_uri,
                                            study_group_default_lang,
                                            sername_elements, serinfo_elements):
        """Iterate MappedParams for Study.study_groups using serName and serInfo elements.

        Common method for DDI122RecordParser and DDI25RecordParser.

        Every serName element yields one MappedParams that carries the name, the URI
        and the serInfo description of the same language, if there is one. Descriptions
        whose language matches no serName are yielded afterwards as MappedParams
        carrying only the description and the URI.

        :param study_group_id: Study Group ID
        :type study_group_id: str
        :param study_group_uri: Study Group URI
        :type study_group_uri: str
        :param study_group_default_lang: Default language of the Study Group
        :type study_group_default_lang: str
        :param sername_elements: Iterable yielding serName elements
        :type sername_elements: iterable
        :param serinfo_elements: Iterable yielding serInfo elements
        :type serinfo_elements: iterable
        :returns: generator yielding MappedParams for Study.study_groups
        :rtype: generator
        """
        xml_lang_attr = '{%s}lang' % (XMLParserBase.NS['xml'],)
        study_groups = cls._study_cls.study_groups
        # {<lang>: <description>}
        # If several serInfo elements share a language, the last one wins.
        lang_descriptions = {}
        for serinfo_el in serinfo_elements:
            lang = serinfo_el.attrib.get(xml_lang_attr, study_group_default_lang)
            lang_descriptions[lang] = element_remove_whitespaces(serinfo_el)
        for sername_el in sername_elements:
            params = MappedParams(study_group_id)
            lang = sername_el.attrib.get(xml_lang_attr, study_group_default_lang)
            params.set_language(lang)
            params.keyword_arguments.update({
                study_groups.attr_name.name: "".join(sername_el.itertext()),
                study_groups.attr_uri.name: study_group_uri,
                # Add description if same language. Pop() so we may later add the ones that are left.
                study_groups.attr_description.name: lang_descriptions.pop(lang, None)})
            yield params
        for lang, description in lang_descriptions.items():
            # Add descriptions which have no other values.
            params = MappedParams(study_group_id)
            params.set_language(lang)
            params.keyword_arguments.update({
                study_groups.attr_description.name: description,
                study_groups.attr_uri.name: study_group_uri
            })
            yield params

    def _iter_study_study_groups_as_mapped_params(self):
        """Iterate mapped study groups

        Use serStmt@ID to get MappedParams value for study_group. Use serName to get study_group
        name and serName@xml:lang to get language. Use serInfo to get study_group description
        and compare language in serInfo@xml:lang with serName@xml:lang, if they differ construct
        a new MappedParams with that language.

        DDI 2.5 documentation on <serStmt>:

            Series statement for the work at the appropriate level: marked-up document;
            marked-up document source; study; study description, other material; other
            material for study. The URI attribute is provided to point to a central Internet
            repository of series information. Repeat this field if the study is part of more
            than one series. Repetition of the internal content should be used to support
            multiple languages only.

            - https://ddialliance.org/Specification/DDI-Codebook/2.5/XMLSchema/field_level_documentation.html

        :returns: generator iterating mapped Study.study_groups
        """
        default_language = self.root_language
        xml_lang_attr = '{%s}lang' % (self.NS['xml'],)
        for serstmt_element in self._findall('./stdyDscr/citation/serStmt'):
            yield from self._iter_params_from_sernames_serinfos(
                serstmt_element.get('ID', None),
                serstmt_element.get('URI', None),
                # serStmt@xml:lang overrides the document's root language.
                serstmt_element.attrib.get(xml_lang_attr, default_language),
                self._findall('./serName', serstmt_element),
                self._findall('./serInfo', serstmt_element))

    def _iter_related_publications_as_mapped_params(self):
        """Iterate MappedParams for Study.related_publications.

        Looks up every ``./stdyDscr/othrStdyMat/relPubl`` element. A relPubl without
        citation children yields a single MappedParams whose value is None and which
        carries only the description. Otherwise one MappedParams is yielded per
        citation that has a ``titlStmt/titl``; its value is the title and it may carry
        identifier, identifier agency, distribution date and URI in addition to the
        description. Citations without a title are logged and skipped.

        :returns: generator yielding MappedParams for Study.related_publications
        """
        relpubl_attrs = self._study_cls.related_publications
        xml_lang_attr = '{%s}lang' % (self.NS['xml'],)
        for relpubl_element in self._findall('./stdyDscr/othrStdyMat/relPubl'):
            description = element_strip_descendant_text(relpubl_element)
            lang = relpubl_element.attrib.get(xml_lang_attr, self.root_language)
            citation_elements = self._findall('./citation', relpubl_element)
            if not citation_elements:
                params = MappedParams(None)
                params.set_language(lang)
                params.keyword_arguments[relpubl_attrs.attr_description.name] = description
                yield params
                continue
            for citation_element in citation_elements:
                title_element = self._find('./titlStmt/titl', citation_element)
                if title_element is None:
                    _logger.warning("No titlStmt/titl inside relPubl/citation. "
                                    "File is invalid DDI and no related publications "
                                    "can be parsed.")
                    continue
                title = ''.join(title_element.itertext())
                params = MappedParams(title)
                params.set_language(lang)
                # IDNo is repeatable inside titlStmt, but Kuha Study model does not support
                # multiple IDs for single title.
                ids_agencys = [(''.join(idno_el.itertext()), idno_el.attrib.get('agency'))
                               for idno_el in self._findall('./titlStmt/IDNo', citation_element)]
                if ids_agencys:
                    identifier, identifier_agency = get_preferred_publication_id_agency_pair(ids_agencys)
                    params.keyword_arguments.update({
                        relpubl_attrs.attr_identifier.name: identifier,
                        relpubl_attrs.attr_identifier_agency.name: identifier_agency
                    })
                # distStmt may actually have multiple distDate-elements. Study model does not
                # support repetition inside contained element, so we can only support one
                # distDate for each relpubl.
                self._get_attr_and_set_param(params,
                                             relpubl_attrs.attr_distribution_date.name,
                                             self._find('./distStmt/distDate', citation_element),
                                             'date')
                # citation may have multiple holdings-elements. Study models does not support
                # repetition inside contained element, so we can only support one uri for
                # each relpubl
                self._get_attr_and_set_param(params,
                                             relpubl_attrs.attr_uri.name,
                                             self._find('./holdings', citation_element),
                                             'URI')
                params.keyword_arguments[relpubl_attrs.attr_description.name] = description
                yield params

    @property
    def _study_maps(self):
        """Mappings used to populate a Study record.

        :returns: list of two-tuples ``(<Study add-method>, <mapping>)``. The list is
                  handed to ``self._map_to_record()`` in :attr:`studies`.
        :rtype: list
        """
        # AuthEnty text is read without the text of its ExtLink children. The ExtLink tag
        # is qualified with the 'ddi' namespace only if the parser declares one.
        ddi_ns = self.NS.get('ddi')
        extlink_tag = '%sExtLink' % ('{%s}' % (ddi_ns,) if ddi_ns else '')
        return [
            (self._study_cls.add_identifiers, self._map_multi('./stdyDscr/citation/titlStmt/IDNo').
             add_attribute(self._study_cls.identifiers.attr_agency.name, self._map_single('.', 'agency'))),
            (self._study_cls.add_study_titles, self._map_multi('./stdyDscr/citation/titlStmt/titl')),
            (self._study_cls.add_document_titles, self._map_multi('./docDscr/citation/titlStmt/titl')),
            (self._study_cls.add_parallel_titles, self._map_multi('./stdyDscr/citation/titlStmt/parTitl')),
            (self._study_cls.add_principal_investigators, self._map_multi('./stdyDscr/citation/rspStmt/AuthEnty').
             set_value_getter(element_strip_descendant_text_by_tag(extlink_tag)).
             add_attribute(self._study_cls.principal_investigators.attr_organization.name,
                           self._map_single('.', 'affiliation')).
             add_attribute(self._study_cls.principal_investigators.attr_external_link.name,
                           self._map_single('./ExtLink')).
             add_attribute(self._study_cls.principal_investigators.attr_external_link_role.name,
                           self._map_single('./ExtLink', 'role')).
             add_attribute(self._study_cls.principal_investigators.attr_external_link_title.name,
                           self._map_single('./ExtLink', 'title')).
             add_attribute(self._study_cls.principal_investigators.attr_external_link_uri.name,
                           self._map_single('./ExtLink', 'URI'))),
            (self._study_cls.add_publishers, self._map_multi('./docDscr/citation/prodStmt/producer').
             add_attribute(self._study_cls.publishers.attr_abbreviation.name, self._map_single('.', 'abbr'))),
            (self._study_cls.add_data_collection_copyrights,
             self._map_multi('./stdyDscr/citation/prodStmt/copyright')),
            (self._study_cls.add_document_uris, self._map_multi('./docDscr/citation/holdings', 'URI').
             add_attribute(self._study_cls.document_uris.attr_location.name, self._map_single('.', 'location')).
             add_attribute(self._study_cls.document_uris.attr_description.name, self._map_single('.'))),
            # NOTE(review): attribute names below are taken from document_uris although the
            # target is study_uris. Presumably both fields use the same attribute names;
            # confirm against the Study record definition.
            (self._study_cls.add_study_uris, self._map_multi('./stdyDscr/citation/holdings', 'URI').
             add_attribute(self._study_cls.document_uris.attr_location.name, self._map_single('.', 'location')).
             add_attribute(self._study_cls.document_uris.attr_description.name, self._map_single('.'))),
            (self._study_cls.add_distributors, self._map_multi('./stdyDscr/citation/distStmt/distrbtr').
             add_attribute(self._study_cls.distributors.attr_abbreviation.name, self._map_single('.', 'abbr')).
             add_attribute(self._study_cls.distributors.attr_uri.name, self._map_single('.', 'URI'))),
            (self._study_cls.add_publication_dates, self._map_multi('./stdyDscr/citation/verStmt/version', 'date')),
            (self._study_cls.add_publication_years, self._map_multi('./stdyDscr/citation/prodStmt/prodDate').
             add_attribute(self._study_cls.publication_years.attr_distribution_date.name,
                           self._map_single('./stdyDscr/citation/distStmt/distDate', 'date'),
                           False)),
            (self._study_cls.add_abstract, self._map_multi('./stdyDscr/stdyInfo/abstract').
             set_value_getter(element_remove_whitespaces)),
            (self._study_cls.add_classifications, self._map_multi('./stdyDscr/stdyInfo/subject/topcClas', 'ID').
             add_attribute(self._study_cls.classifications.attr_system_name.name, self._map_single('.', 'vocab')).
             add_attribute(self._study_cls.classifications.attr_uri.name, self._map_single('.', 'vocabURI')).
             add_attribute(self._study_cls.classifications.attr_description.name, self._map_single('.'))),
            (self._study_cls.add_keywords, self._map_multi('./stdyDscr/stdyInfo/subject/keyword', 'ID').
             add_attribute(self._study_cls.keywords.attr_system_name.name, self._map_single('.', 'vocab')).
             add_attribute(self._study_cls.keywords.attr_uri.name, self._map_single('.', 'vocabURI')).
             add_attribute(self._study_cls.keywords.attr_description.name, self._map_single('.'))),
            (self._study_cls.add_collection_periods, self._map_multi('./stdyDscr/stdyInfo/sumDscr/collDate', 'date').
             add_attribute(self._study_cls.collection_periods.attr_event.name, self._map_single('.', 'event'))),
            (self._study_cls.add_study_area_countries, self._map_multi('./stdyDscr/stdyInfo/sumDscr/nation').
             add_attribute(self._study_cls.study_area_countries.attr_abbreviation.name,
                           self._map_single('.', 'abbr'))),
            (self._study_cls.add_geographic_coverages, self._map_multi('./stdyDscr/stdyInfo/sumDscr/geogCover')),
            (self._study_cls.add_analysis_units, self._map_multi('./stdyDscr/stdyInfo/sumDscr/anlyUnit/concept').
             add_attribute(self._study_cls.analysis_units.attr_description.name,
                           self._map_single('/..', localizable=True).
                           set_value_getter(element_strip_descendant_text), provides_main_lang=True).
             add_attribute(self._study_cls.analysis_units.attr_system_name.name, self._map_single('.', 'vocab')).
             add_attribute(self._study_cls.analysis_units.attr_uri.name, self._map_single('.', 'vocabURI'))),
            (self._study_cls.add_universes, self._map_multi('./stdyDscr/stdyInfo/sumDscr/universe').
             add_attribute(self._study_cls.universes.attr_included.name, self._map_single('.', 'clusion').
                           set_value_conversion(str_equals('I', default=True)))),
            (self._study_cls.add_data_kinds, self._map_multi('./stdyDscr/stdyInfo/sumDscr/dataKind')),
            (self._study_cls.add_time_methods, self._map_multi('./stdyDscr/method/dataColl/timeMeth/concept').
             add_attribute(self._study_cls.time_methods.attr_description.name,
                           self._map_single('/..', localizable=True).
                           set_value_getter(element_strip_descendant_text), provides_main_lang=True).
             add_attribute(self._study_cls.time_methods.attr_system_name.name, self._map_single('.', 'vocab')).
             add_attribute(self._study_cls.time_methods.attr_uri.name, self._map_single('.', 'vocabURI'))),
            (self._study_cls.add_sampling_procedures, self._map_multi('./stdyDscr/method/dataColl/sampProc/concept').
             add_attribute(self._study_cls.sampling_procedures.attr_description.name,
                           self._map_single('/..', localizable=True).
                           set_value_getter(element_strip_descendant_text), provides_main_lang=True).
             add_attribute(self._study_cls.sampling_procedures.attr_system_name.name, self._map_single('.', 'vocab')).
             add_attribute(self._study_cls.sampling_procedures.attr_uri.name, self._map_single('.', 'vocabURI'))),
            (self._study_cls.add_collection_modes, self._map_multi('./stdyDscr/method/dataColl/collMode/concept').
             add_attribute(self._study_cls.collection_modes.attr_description.name,
                           self._map_single('/..', localizable=True).
                           set_value_getter(element_strip_descendant_text), provides_main_lang=True).
             add_attribute(self._study_cls.collection_modes.attr_system_name.name, self._map_single('.', 'vocab')).
             add_attribute(self._study_cls.collection_modes.attr_uri.name, self._map_single('.', 'vocabURI'))),
            (self._study_cls.add_data_access, self._map_multi('./stdyDscr/dataAccs/useStmt/restrctn')),
            (self._study_cls.add_citation_requirements, self._map_multi('./stdyDscr/dataAccs/useStmt/citReq')),
            (self._study_cls.add_deposit_requirements, self._map_multi('./stdyDscr/dataAccs/useStmt/deposReq')),
            (self._study_cls.add_data_access_descriptions, self._map_multi('./stdyDscr/dataAccs/useStmt/conditions')),
            (self._study_cls.add_file_names, self._map_multi('./fileDscr/fileTxt/fileName')),
            (self._study_cls.add_instruments,
             self._map_multi('./stdyDscr/othrStdyMat/relMat/citation/titlStmt/IDNo').
             add_attribute(self._study_cls.instruments.attr_instrument_name.name, self._map_single('/../titl'))),
            (self._study_cls.add_copyrights, self._map_multi('./docDscr/citation/prodStmt/copyright')),
            (self._study_cls.add_funding_agencies, self._map_multi('./stdyDscr/citation/prodStmt/fundAg').
             add_attribute(self._study_cls.funding_agencies.attr_abbreviation.name, self._map_single('.', 'abbr')).
             add_attribute(self._study_cls.funding_agencies.attr_role.name, self._map_single('.', 'role'))),
            (self._study_cls.add_grant_numbers, self._map_multi('./stdyDscr/citation/prodStmt/grantNo').
             add_attribute(self._study_cls.grant_numbers.attr_agency.name, self._map_single('.', 'agency')).
             add_attribute(self._study_cls.grant_numbers.attr_role.name, self._map_single('.', 'role')))
        ]

    @property
    def studies(self):
        """Parse XML to create and populate :obj:`kuha_common.document_store.records.Study`.

        The study number is parsed first if it has not been parsed yet. Exactly one
        Study is yielded. It is populated from :attr:`_study_maps`, after which study
        groups and related publications are added from their own iterators.

        :returns: Generator to Populate Document Store Study record.
        """
        if self.study_number is None:
            self._parse_study_number()
        study = self._study_cls()
        study.add_study_number(self.study_number_identifier)
        self._map_to_record(study, self.root_element, self._study_maps)
        for mapped_study_group in self._iter_study_study_groups_as_mapped_params():
            study.add_study_groups(*mapped_study_group.arguments,
                                   **mapped_study_group.keyword_arguments)
        for mapped_related_publications in self._iter_related_publications_as_mapped_params():
            study.add_related_publications(*mapped_related_publications.arguments,
                                           **mapped_related_publications.keyword_arguments)
        yield study

    @property
    def _variable_maps(self):
        """Mappings used to populate a Variable record from a ``var`` element.

        :returns: list of two-tuples ``(<Variable add-method>, <mapping>)``. The list is
                  handed to ``self._map_to_record()`` in :attr:`variables`.
        :rtype: list
        """
        return [
            (self._variable_cls.add_variable_name, self._map_single('.', 'name', required=True).
             set_value_conversion(as_valid_identifier)),
            (self._variable_cls.add_question_identifiers, self._map_multi('./qstn', 'ID', localizable=False)
             .set_value_conversion(as_valid_identifier)),
            (self._variable_cls.add_variable_labels, self._map_multi('./labl')),
            (self._variable_cls.add_codelist_codes, self._map_multi('./catgry', 'ID').
             add_attribute(self._variable_cls.codelist_codes.attr_label.name,
                           self._map_multi('./labl'), provides_main_lang=True).
             add_attribute(self._variable_cls.codelist_codes.attr_missing.name, self._map_single('.', 'missing').
                           set_value_conversion(str_equals('Y', False)))),
        ]

    @property
    def variables(self):
        """Parse XML to create and populate multiple
        :obj:`kuha_common.document_store.records.Variable` instances.

        One Variable is yielded for every ``./dataDscr/var`` element. The study number
        is parsed first if it has not been parsed yet.

        :returns: Generator to populate multiple Document Store Variable records.
        """
        if self.study_number is None:
            self._parse_study_number()
        # Build the mappings once; the same list is applied to every var element.
        maps = self._variable_maps
        for var_element in self._findall('./dataDscr/var'):
            variable = self._variable_cls()
            variable.add_study_number(self.study_number_identifier)
            self._map_to_record(variable, var_element, maps)
            yield variable

    @property
    def _question_maps(self):
        """Mappings used to populate a Question record from a ``qstn`` element.

        :returns: list of two-tuples ``(<Question add-method>, <mapping>)``. The list is
                  handed to ``self._map_to_record()`` in :attr:`questions`.
        :rtype: list
        """
        return [
            (self._question_cls.add_question_identifier, self._map_single('.', 'ID', required=True)
             .set_value_conversion(as_valid_identifier)),
            (self._question_cls.add_question_texts, self._map_multi('./qstnLit'))
        ]

    @property
    def questions(self):
        """Parse XML to create and populate multiple
        :obj:`kuha_common.document_store.records.Question` instances.

        One Question is yielded for every ``qstn`` element found directly under every
        ``./dataDscr/var`` element. Each Question gets the study number, the name of
        the enclosing variable (if it is truthy), every research instrument found from
        ``./stdyDscr/method/dataColl/resInstru`` and a codelist reference for every
        ``./code`` element of the enclosing variable.

        :returns: Generator to populate multiple Document Store Question records.
        """
        if self.study_number is None:
            self._parse_study_number()
        resinstru_map = self._map_multi('./stdyDscr/method/dataColl/resInstru')\
                            .set_value_getter(element_remove_whitespaces)
        # Research instruments are study-level; collect once and add to every question.
        research_instruments = [
            instru_params.arguments
            for instru_params in resinstru_map(self.root_element, self.root_language, self.NS)]
        maps = self._question_maps
        for var_element in self._findall('./dataDscr/var'):
            variable_name = as_valid_identifier(var_element.get('name'))
            codes = [
                code_params.arguments
                for code_params in self._map_multi('./code')(var_element, self.root_language, self.NS)]
            for qstn_element in self._findall('./qstn', var_element):
                question = self._question_cls()
                question.add_study_number(self.study_number_identifier)
                if variable_name:
                    question.add_variable_name(variable_name)
                for instru in research_instruments:
                    question.add_research_instruments(*instru)
                for _code in codes:
                    question.add_codelist_references(*_code)
                self._map_to_record(question, qstn_element, maps)
                yield question

    @property
    def _study_group_maps(self):
        """Mappings used to populate a StudyGroup record from a ``serStmt`` element.

        :returns: list of two-tuples ``(<StudyGroup add-method>, <mapping>)``. The list
                  is handed to ``self._map_to_record()`` in :attr:`study_groups`.
        :rtype: list
        """
        return [
            (self._studygroup_cls.add_study_group_identifier, self._map_single('.', 'ID', required=True)
             .set_value_conversion(as_valid_identifier)),
            (self._studygroup_cls.add_uris, self._map_single('.', 'URI', localizable=True)),
            (self._studygroup_cls.add_study_group_names, self._map_multi('./serName')),
            (self._studygroup_cls.add_descriptions, self._map_multi('./serInfo')
             .set_value_getter(element_remove_whitespaces))
        ]

    @property
    def study_groups(self):
        """Parse XML to create and populate multiple
        :obj:`kuha_common.document_store.records.StudyGroup` instances.

        A StudyGroup is built for every ``./stdyDscr/citation/serStmt`` element. Each
        new StudyGroup is first offered to the ones collected so far via their
        ``updates()`` method; it is kept as a separate record only if none of them
        returns a truthy value. All serStmt elements are processed before the first
        StudyGroup is yielded.

        :returns: Generator to populate multiple Document Store StudyGroup records.
        """
        study_groups = []

        def update_or_append_study_group(study_group):
            # Stop at the first collected study group that accepts the update.
            for primary_study_group in study_groups:
                if primary_study_group.updates(study_group):
                    return
            study_groups.append(study_group)

        if self.study_number is None:
            self._parse_study_number()
        maps = self._study_group_maps
        for serstmt_element in self._findall('./stdyDscr/citation/serStmt'):
            study_group = self._studygroup_cls()
            study_group.add_study_numbers(self.study_number_identifier)
            self._map_to_record(study_group, serstmt_element, maps)
            update_or_append_study_group(study_group)
        yield from study_groups
class DDI122NesstarRecordParser(DDI122RecordParser):
    """Parse Document Store records from DDI 1.2.2. XML whose root element is
    ``codeBook`` in the ``http://www.icpsr.umich.edu/DDI`` namespace.

    Differs from :class:`DDI122RecordParser` only by the expected root tag.
    """

    _expected_root_tag = '{http://www.icpsr.umich.edu/DDI}codeBook'
class DDI25RecordParser(DDI122RecordParser):
    """Parse Document Store records from DDI 2.5 XML.
    """

    #: XML namespaces
    NS = {'xsi': 'http://www.w3.org/2001/XMLSchema-instance',
          'ddi': 'ddi:codebook:2_5',
          'xml': 'http://www.w3.org/XML/1998/namespace'}

    _expected_root_tag = '{ddi:codebook:2_5}codeBook'

    @classmethod
    def _xpath(cls, xpath):
        """Every xpath is prepended with namespace.

        The path is split on ``/`` and every step is prefixed with ``ddi:``, except
        for empty steps and the ``.`` and ``..`` steps, which are kept as they are.
        The result is passed on to the parent class implementation.

        :param xpath: XPath expression without namespace prefixes.
        :type xpath: str
        :returns: whatever the parent class ``_xpath()`` returns for the prefixed path.
        """
        prefixed_steps = (step if step in ('', '.', '..') else 'ddi:%s' % (step,)
                          for step in xpath.split('/'))
        return super()._xpath('/'.join(prefixed_steps))