# Source code for kuha_common.document_store.mappings.ddi.codebook

#!/usr/bin/env python3
# Author(s): Toni Sissala
# Copyright 2022 Finnish Social Science Data Archive FSD / University of Tampere
# Licensed under the EUPL. See LICENSE.txt for full license.
#
"""Mapping profiles for DDI-codebook versions.
"""
import logging
from kuha_common.document_store.mappings.exceptions import UnknownXMLRoot
from kuha_common.document_store.mappings.xmlbase import (
    MappedParams,
    XMLParserBase,
    as_valid_identifier,
    str_equals,
    element_remove_whitespaces,
    element_strip_descendant_text,
    element_strip_descendant_text_by_tag,
    get_preferred_publication_id_agency_pair
)


_logger = logging.getLogger(__name__)


class DDI122RecordParser(XMLParserBase):
    """Parse Document Store records from DDI 1.2.2 XML.

    The parser is constructed with the root element of an XML document.
    Records are exposed through the generator properties :attr:`studies`,
    :attr:`variables`, :attr:`questions` and :attr:`study_groups`.

    :param root_element: Root element of the XML document.
    :raises UnknownXMLRoot: if the tag of ``root_element`` differs from
                            :attr:`_expected_root_tag`.
    """

    #: Tag that the document root element must carry.
    _expected_root_tag = 'codeBook'

    def __init__(self, root_element):
        if root_element.tag != self._expected_root_tag:
            raise UnknownXMLRoot(root_element.tag, self._expected_root_tag)
        super().__init__(root_element)

    def _parse_study_number(self):
        """Parse and store study number.

        Takes the first value mapped from ``./stdyDscr/citation/titlStmt/IDNo``
        and assigns it to ``self.study_number``.
        """
        _map = self._map_single('./stdyDscr/citation/titlStmt/IDNo', required=True)
        study_number = next(_map(self.root_element, self.root_language, self.NS)).get_value()
        self.study_number = study_number

    @classmethod
    def _iter_params_from_sernames_serinfos(cls, study_group_id, study_group_uri,
                                            study_group_default_lang,
                                            sername_elements, serinfo_elements):
        """Iterate MappedParams for Study.study_groups using serName and
        serInfo elements.

        Common method for DDI122RecordParser and DDI25RecordParser.

        Every serName element yields one MappedParams carrying name, URI and
        the serInfo description of the same language (``None`` if there is
        none). serInfo descriptions whose language matched no serName are
        yielded afterwards with description and URI only.

        :param study_group_id: Study Group ID
        :type study_group_id: str
        :param study_group_uri: Study Group URI
        :type study_group_uri: str
        :param study_group_default_lang: Default language of the Study Group
        :type study_group_default_lang: str
        :param sername_elements: Iterable yielding serName elements
        :type sername_elements: iterable
        :param serinfo_elements: Iterable yielding serInfo elements
        :type serinfo_elements: iterable
        :returns: generator yielding MappedParams for Study.study_groups
        :rtype: generator
        """
        # {<lang>: <description>}
        # A later serInfo with the same language overwrites an earlier one.
        lang_descriptions = {}
        for serinfo_el in serinfo_elements:
            lang = serinfo_el.attrib.get('{%s}lang' % (XMLParserBase.NS['xml'],),
                                         study_group_default_lang)
            lang_descriptions[lang] = element_remove_whitespaces(serinfo_el)
        for sername_el in sername_elements:
            params = MappedParams(study_group_id)
            lang = sername_el.attrib.get('{%s}lang' % (XMLParserBase.NS['xml'],),
                                         study_group_default_lang)
            params.set_language(lang)
            params.keyword_arguments.update({
                cls._study_cls.study_groups.attr_name.name: "".join(sername_el.itertext()),
                cls._study_cls.study_groups.attr_uri.name: study_group_uri,
                # Add description if same language. Pop() so we may later
                # add the ones that are left.
                cls._study_cls.study_groups.attr_description.name: lang_descriptions.pop(lang, None)})
            yield params
        for lang, description in lang_descriptions.items():
            # Add descriptions which have no other values.
            params = MappedParams(study_group_id)
            params.set_language(lang)
            params.keyword_arguments.update({
                cls._study_cls.study_groups.attr_description.name: description,
                cls._study_cls.study_groups.attr_uri.name: study_group_uri
            })
            yield params

    def _iter_study_study_groups_as_mapped_params(self):
        """Iterate mapped study groups

        Use serStmt@ID to get MappedParams value for study_group.
        Use serName to get study_group name and serName@xml:lang to get language.
        Use serInfo to get study_group description and compare language in
        serInfo@xml:lang with serName@xml:lang, if they differ construct a
        new MappedParams with that language.

        DDI 2.5 documentation on <serStmt>:

            Series statement for the work at the appropriate level: marked-up
            document; marked-up document source; study; study description,
            other material; other material for study. The URI attribute is
            provided to point to a central Internet repository of series
            information. Repeat this field if the study is part of more than
            one series. Repetition of the internal content should be used to
            support multiple languages only.

            - https://ddialliance.org/Specification/DDI-Codebook/2.5/XMLSchema/field_level_documentation.html

        :returns: generator iterating mapped Study.study_groups
        """
        default_language = self.root_language
        for serstmt_element in self._findall('./stdyDscr/citation/serStmt'):
            study_group_id = serstmt_element.get('ID', None)
            study_group_uri = serstmt_element.get('URI', None)
            # serStmt@xml:lang overrides the document language for its children.
            study_group_default_lang = serstmt_element.attrib.get(
                '{%s}lang' % (self.NS['xml'],), default_language)
            yield from self._iter_params_from_sernames_serinfos(
                study_group_id, study_group_uri, study_group_default_lang,
                self._findall('./serName', serstmt_element),
                self._findall('./serInfo', serstmt_element))

    def _iter_related_publications_as_mapped_params(self):
        """Iterate mapped related publications.

        Iterates ``./stdyDscr/othrStdyMat/relPubl`` elements. A relPubl
        without citation children yields a single MappedParams that carries
        only the description. Otherwise one MappedParams is yielded for every
        citation that has a ``titlStmt/titl``; citations without a title are
        logged and skipped.

        :returns: generator yielding MappedParams for
                  Study.related_publications
        :rtype: generator
        """
        for relpubl_element in self._findall('./stdyDscr/othrStdyMat/relPubl'):
            description = element_strip_descendant_text(relpubl_element)
            lang = relpubl_element.attrib.get('{%s}lang' % (self.NS['xml'],), self.root_language)
            citation_elements = self._findall('./citation', relpubl_element)
            if not citation_elements:
                params = MappedParams(None)
                params.set_language(lang)
                params.keyword_arguments[self._study_cls.related_publications.attr_description.name] = description
                yield params
                continue
            for citation_element in citation_elements:
                title_element = self._find('./titlStmt/titl', citation_element)
                if title_element is None:
                    _logger.warning("No titlStmt/titl inside relPubl/citation. "
                                    "File is invalid DDI and no related publications "
                                    "can be parsed.")
                    continue
                title = ''.join(title_element.itertext())
                params = MappedParams(title)
                params.set_language(lang)
                # IDNo is repeatable inside titlStmt, but Kuha Study model does not support
                # multiple IDs for single title.
                ids_agencys = [
                    (''.join(idno_el.itertext()), idno_el.attrib.get('agency'))
                    for idno_el in self._findall('./titlStmt/IDNo', citation_element)]
                if ids_agencys:
                    identifier, identifier_agency = get_preferred_publication_id_agency_pair(ids_agencys)
                    params.keyword_arguments.update({
                        self._study_cls.related_publications.attr_identifier.name: identifier,
                        self._study_cls.related_publications.attr_identifier_agency.name: identifier_agency
                    })
                # distStmt may actually have multiple distDate-elements. Study model does not
                # support repetition inside contained element, so we can only support one
                # distDate for each relpubl.
                self._get_attr_and_set_param(params,
                                             self._study_cls.related_publications.attr_distribution_date.name,
                                             self._find('./distStmt/distDate', citation_element),
                                             'date')
                # citation may have multiple holdings-elements. Study model does not support
                # repetition inside contained element, so we can only support one uri for
                # each relpubl
                self._get_attr_and_set_param(params,
                                             self._study_cls.related_publications.attr_uri.name,
                                             self._find('./holdings', citation_element),
                                             'URI')
                params.keyword_arguments[self._study_cls.related_publications.attr_description.name] = description
                yield params

    @property
    def _study_maps(self):
        """List of ``(Study add-method, XML map)`` pairs used to populate a
        Study record from the document root element.
        """
        # Tag of the ExtLink child, namespace-qualified when the parser
        # declares a 'ddi' namespace.
        ddi_namespace = self.NS.get('ddi')
        extlink_tag = '%sExtLink' % ('{%s}' % (ddi_namespace,) if ddi_namespace else '')
        return [
            (self._study_cls.add_identifiers,
             self._map_multi('./stdyDscr/citation/titlStmt/IDNo').
             add_attribute(self._study_cls.identifiers.attr_agency.name,
                           self._map_single('.', 'agency'))),
            (self._study_cls.add_study_titles,
             self._map_multi('./stdyDscr/citation/titlStmt/titl')),
            (self._study_cls.add_document_titles,
             self._map_multi('./docDscr/citation/titlStmt/titl')),
            (self._study_cls.add_parallel_titles,
             self._map_multi('./stdyDscr/citation/titlStmt/parTitl')),
            (self._study_cls.add_principal_investigators,
             self._map_multi('./stdyDscr/citation/rspStmt/AuthEnty').
             set_value_getter(element_strip_descendant_text_by_tag(extlink_tag)).
             add_attribute(self._study_cls.principal_investigators.attr_organization.name,
                           self._map_single('.', 'affiliation')).
             add_attribute(self._study_cls.principal_investigators.attr_external_link.name,
                           self._map_single('./ExtLink')).
             add_attribute(self._study_cls.principal_investigators.attr_external_link_role.name,
                           self._map_single('./ExtLink', 'role')).
             add_attribute(self._study_cls.principal_investigators.attr_external_link_title.name,
                           self._map_single('./ExtLink', 'title')).
             add_attribute(self._study_cls.principal_investigators.attr_external_link_uri.name,
                           self._map_single('./ExtLink', 'URI'))),
            (self._study_cls.add_publishers,
             self._map_multi('./docDscr/citation/prodStmt/producer').
             add_attribute(self._study_cls.publishers.attr_abbreviation.name,
                           self._map_single('.', 'abbr'))),
            (self._study_cls.add_data_collection_copyrights,
             self._map_multi('./stdyDscr/citation/prodStmt/copyright')),
            (self._study_cls.add_document_uris,
             self._map_multi('./docDscr/citation/holdings', 'URI').
             add_attribute(self._study_cls.document_uris.attr_location.name,
                           self._map_single('.', 'location')).
             add_attribute(self._study_cls.document_uris.attr_description.name,
                           self._map_single('.'))),
            # NOTE(review): the attribute names below are read from
            # document_uris, not study_uris. Presumably both field types
            # declare identically named attributes - confirm against the
            # Study record definition before changing.
            (self._study_cls.add_study_uris,
             self._map_multi('./stdyDscr/citation/holdings', 'URI').
             add_attribute(self._study_cls.document_uris.attr_location.name,
                           self._map_single('.', 'location')).
             add_attribute(self._study_cls.document_uris.attr_description.name,
                           self._map_single('.'))),
            (self._study_cls.add_distributors,
             self._map_multi('./stdyDscr/citation/distStmt/distrbtr').
             add_attribute(self._study_cls.distributors.attr_abbreviation.name,
                           self._map_single('.', 'abbr')).
             add_attribute(self._study_cls.distributors.attr_uri.name,
                           self._map_single('.', 'URI'))),
            (self._study_cls.add_publication_dates,
             self._map_multi('./stdyDscr/citation/verStmt/version', 'date')),
            # NOTE(review): the distDate xpath is written from the document
            # root and add_attribute receives a third positional ``False``;
            # its meaning is defined in xmlbase - confirm there.
            (self._study_cls.add_publication_years,
             self._map_multi('./stdyDscr/citation/prodStmt/prodDate').
             add_attribute(self._study_cls.publication_years.attr_distribution_date.name,
                           self._map_single('./stdyDscr/citation/distStmt/distDate', 'date'),
                           False)),
            (self._study_cls.add_abstract,
             self._map_multi('./stdyDscr/stdyInfo/abstract').
             set_value_getter(element_remove_whitespaces)),
            (self._study_cls.add_classifications,
             self._map_multi('./stdyDscr/stdyInfo/subject/topcClas', 'ID').
             add_attribute(self._study_cls.classifications.attr_system_name.name,
                           self._map_single('.', 'vocab')).
             add_attribute(self._study_cls.classifications.attr_uri.name,
                           self._map_single('.', 'vocabURI')).
             add_attribute(self._study_cls.classifications.attr_description.name,
                           self._map_single('.'))),
            (self._study_cls.add_keywords,
             self._map_multi('./stdyDscr/stdyInfo/subject/keyword', 'ID').
             add_attribute(self._study_cls.keywords.attr_system_name.name,
                           self._map_single('.', 'vocab')).
             add_attribute(self._study_cls.keywords.attr_uri.name,
                           self._map_single('.', 'vocabURI')).
             add_attribute(self._study_cls.keywords.attr_description.name,
                           self._map_single('.'))),
            (self._study_cls.add_collection_periods,
             self._map_multi('./stdyDscr/stdyInfo/sumDscr/collDate', 'date').
             add_attribute(self._study_cls.collection_periods.attr_event.name,
                           self._map_single('.', 'event'))),
            (self._study_cls.add_study_area_countries,
             self._map_multi('./stdyDscr/stdyInfo/sumDscr/nation').
             add_attribute(self._study_cls.study_area_countries.attr_abbreviation.name,
                           self._map_single('.', 'abbr'))),
            (self._study_cls.add_geographic_coverages,
             self._map_multi('./stdyDscr/stdyInfo/sumDscr/geogCover')),
            (self._study_cls.add_analysis_units,
             self._map_multi('./stdyDscr/stdyInfo/sumDscr/anlyUnit/concept').
             add_attribute(self._study_cls.analysis_units.attr_description.name,
                           self._map_single('/..', localizable=True).
                           set_value_getter(element_strip_descendant_text),
                           provides_main_lang=True).
             add_attribute(self._study_cls.analysis_units.attr_system_name.name,
                           self._map_single('.', 'vocab')).
             add_attribute(self._study_cls.analysis_units.attr_uri.name,
                           self._map_single('.', 'vocabURI'))),
            (self._study_cls.add_universes,
             self._map_multi('./stdyDscr/stdyInfo/sumDscr/universe').
             add_attribute(self._study_cls.universes.attr_included.name,
                           self._map_single('.', 'clusion').
                           set_value_conversion(str_equals('I', default=True)))),
            (self._study_cls.add_data_kinds,
             self._map_multi('./stdyDscr/stdyInfo/sumDscr/dataKind')),
            (self._study_cls.add_time_methods,
             self._map_multi('./stdyDscr/method/dataColl/timeMeth/concept').
             add_attribute(self._study_cls.time_methods.attr_description.name,
                           self._map_single('/..', localizable=True).
                           set_value_getter(element_strip_descendant_text),
                           provides_main_lang=True).
             add_attribute(self._study_cls.time_methods.attr_system_name.name,
                           self._map_single('.', 'vocab')).
             add_attribute(self._study_cls.time_methods.attr_uri.name,
                           self._map_single('.', 'vocabURI'))),
            (self._study_cls.add_sampling_procedures,
             self._map_multi('./stdyDscr/method/dataColl/sampProc/concept').
             add_attribute(self._study_cls.sampling_procedures.attr_description.name,
                           self._map_single('/..', localizable=True).
                           set_value_getter(element_strip_descendant_text),
                           provides_main_lang=True).
             add_attribute(self._study_cls.sampling_procedures.attr_system_name.name,
                           self._map_single('.', 'vocab')).
             add_attribute(self._study_cls.sampling_procedures.attr_uri.name,
                           self._map_single('.', 'vocabURI'))),
            (self._study_cls.add_collection_modes,
             self._map_multi('./stdyDscr/method/dataColl/collMode/concept').
             add_attribute(self._study_cls.collection_modes.attr_description.name,
                           self._map_single('/..', localizable=True).
                           set_value_getter(element_strip_descendant_text),
                           provides_main_lang=True).
             add_attribute(self._study_cls.collection_modes.attr_system_name.name,
                           self._map_single('.', 'vocab')).
             add_attribute(self._study_cls.collection_modes.attr_uri.name,
                           self._map_single('.', 'vocabURI'))),
            (self._study_cls.add_data_access,
             self._map_multi('./stdyDscr/dataAccs/useStmt/restrctn')),
            (self._study_cls.add_citation_requirements,
             self._map_multi('./stdyDscr/dataAccs/useStmt/citReq')),
            (self._study_cls.add_deposit_requirements,
             self._map_multi('./stdyDscr/dataAccs/useStmt/deposReq')),
            (self._study_cls.add_data_access_descriptions,
             self._map_multi('./stdyDscr/dataAccs/useStmt/conditions')),
            (self._study_cls.add_file_names,
             self._map_multi('./fileDscr/fileTxt/fileName')),
            (self._study_cls.add_instruments,
             self._map_multi('./stdyDscr/othrStdyMat/relMat/citation/titlStmt/IDNo').
             add_attribute(self._study_cls.instruments.attr_instrument_name.name,
                           self._map_single('/../titl'))),
            (self._study_cls.add_copyrights,
             self._map_multi('./docDscr/citation/prodStmt/copyright')),
            (self._study_cls.add_funding_agencies,
             self._map_multi('./stdyDscr/citation/prodStmt/fundAg').
             add_attribute(self._study_cls.funding_agencies.attr_abbreviation.name,
                           self._map_single('.', 'abbr')).
             add_attribute(self._study_cls.funding_agencies.attr_role.name,
                           self._map_single('.', 'role'))),
            (self._study_cls.add_grant_numbers,
             self._map_multi('./stdyDscr/citation/prodStmt/grantNo').
             add_attribute(self._study_cls.grant_numbers.attr_agency.name,
                           self._map_single('.', 'agency')).
             add_attribute(self._study_cls.grant_numbers.attr_role.name,
                           self._map_single('.', 'role')))
        ]

    @property
    def studies(self):
        """Parse XML to create and populate
        :obj:`kuha_common.document_store.records.Study`.

        Yields a single Study populated from :attr:`_study_maps`, the
        serStmt elements (study groups) and the relPubl elements (related
        publications).

        :returns: Generator to Populate Document Store Study record.
        """
        if self.study_number is None:
            self._parse_study_number()
        study = self._study_cls()
        study.add_study_number(self.study_number_identifier)
        self._map_to_record(study, self.root_element, self._study_maps)
        for mapped_study_group in self._iter_study_study_groups_as_mapped_params():
            study.add_study_groups(*mapped_study_group.arguments,
                                   **mapped_study_group.keyword_arguments)
        for mapped_related_publications in self._iter_related_publications_as_mapped_params():
            study.add_related_publications(*mapped_related_publications.arguments,
                                           **mapped_related_publications.keyword_arguments)
        yield study

    @property
    def _variable_maps(self):
        """List of ``(Variable add-method, XML map)`` pairs applied to each
        ``var`` element.
        """
        return [
            (self._variable_cls.add_variable_name,
             self._map_single('.', 'name', required=True).
             set_value_conversion(as_valid_identifier)),
            (self._variable_cls.add_question_identifiers,
             self._map_multi('./qstn', 'ID', localizable=False)
             .set_value_conversion(as_valid_identifier)),
            (self._variable_cls.add_variable_labels,
             self._map_multi('./labl')),
            (self._variable_cls.add_codelist_codes,
             self._map_multi('./catgry', 'ID').
             add_attribute(self._variable_cls.codelist_codes.attr_label.name,
                           self._map_multi('./labl'), provides_main_lang=True).
             add_attribute(self._variable_cls.codelist_codes.attr_missing.name,
                           self._map_single('.', 'missing').
                           set_value_conversion(str_equals('Y', False)))),
        ]

    @property
    def variables(self):
        """Parse XML to create and populate multiple
        :obj:`kuha_common.document_store.records.Variable` instances.

        One Variable is yielded for every ``./dataDscr/var`` element.

        :returns: Generator to populate multiple Document Store Variable records.
        """
        if self.study_number is None:
            self._parse_study_number()
        # Build the maps once and reuse them for every var element.
        maps = self._variable_maps
        for var_element in self._findall('./dataDscr/var'):
            variable = self._variable_cls()
            variable.add_study_number(self.study_number_identifier)
            self._map_to_record(variable, var_element, maps)
            yield variable

    @property
    def _question_maps(self):
        """List of ``(Question add-method, XML map)`` pairs applied to each
        ``qstn`` element.
        """
        return [
            (self._question_cls.add_question_identifier,
             self._map_single('.', 'ID', required=True)
             .set_value_conversion(as_valid_identifier)),
            (self._question_cls.add_question_texts,
             self._map_multi('./qstnLit'))
        ]

    @property
    def questions(self):
        """Parse XML to create and populate multiple
        :obj:`kuha_common.document_store.records.Question` instances.

        One Question is yielded for every ``qstn`` element under each
        ``./dataDscr/var`` element. Every question receives all document
        level research instruments and the ``code`` values mapped from its
        enclosing var element.

        :returns: Generator to populate multiple Document Store Question records.
        """
        if self.study_number is None:
            self._parse_study_number()
        resinstru_map = self._map_multi('./stdyDscr/method/dataColl/resInstru')\
                            .set_value_getter(element_remove_whitespaces)
        # Research instruments are document level; collect them once.
        research_instruments = [
            instru_params.arguments
            for instru_params in resinstru_map(self.root_element, self.root_language, self.NS)]
        maps = self._question_maps
        for var_element in self._findall('./dataDscr/var'):
            variable_name = as_valid_identifier(var_element.get('name'))
            codes = [
                code_params.arguments
                for code_params in self._map_multi('./code')(var_element, self.root_language, self.NS)]
            for qstn_element in self._findall('./qstn', var_element):
                question = self._question_cls()
                question.add_study_number(self.study_number_identifier)
                if variable_name:
                    question.add_variable_name(variable_name)
                for instru in research_instruments:
                    question.add_research_instruments(*instru)
                for _code in codes:
                    question.add_codelist_references(*_code)
                self._map_to_record(question, qstn_element, maps)
                yield question

    @property
    def _study_group_maps(self):
        """List of ``(StudyGroup add-method, XML map)`` pairs applied to each
        ``serStmt`` element.
        """
        return [
            (self._studygroup_cls.add_study_group_identifier,
             self._map_single('.', 'ID', required=True)
             .set_value_conversion(as_valid_identifier)),
            (self._studygroup_cls.add_uris,
             self._map_single('.', 'URI', localizable=True)),
            (self._studygroup_cls.add_study_group_names,
             self._map_multi('./serName')),
            (self._studygroup_cls.add_descriptions,
             self._map_multi('./serInfo')
             .set_value_getter(element_remove_whitespaces))
        ]

    @property
    def study_groups(self):
        """Parse XML to create and populate multiple
        :obj:`kuha_common.document_store.records.StudyGroup` instances.

        A StudyGroup is built for every ``./stdyDscr/citation/serStmt``
        element. Each new group is first offered to the groups collected so
        far via their ``updates()`` method and is appended as a new group
        only if none of them reports a truthy result.

        :returns: Generator to populate multiple Document Store StudyGroup records.
        """
        study_groups = []

        def update_or_append_study_group(study_group):
            # Stop at the first already collected group that accepts it.
            for primary_study_group in study_groups:
                if primary_study_group.updates(study_group):
                    return
            study_groups.append(study_group)

        if self.study_number is None:
            self._parse_study_number()
        maps = self._study_group_maps
        for serstmt_element in self._findall('./stdyDscr/citation/serStmt'):
            study_group = self._studygroup_cls()
            study_group.add_study_numbers(self.study_number_identifier)
            self._map_to_record(study_group, serstmt_element, maps)
            update_or_append_study_group(study_group)
        yield from study_groups
class DDI122NesstarRecordParser(DDI122RecordParser):
    """Parse Document Store records from DDI 1.2.2 XML whose root element
    is ``codeBook`` in the ``http://www.icpsr.umich.edu/DDI`` namespace.

    Only the expected root tag differs from :class:`DDI122RecordParser`.
    """

    #: Namespace-qualified tag that the document root element must carry.
    _expected_root_tag = '{http://www.icpsr.umich.edu/DDI}codeBook'
class DDI25RecordParser(DDI122RecordParser):
    """Parse Document Store records from DDI 2.5 XML.

    Reuses the mappings of :class:`DDI122RecordParser`. Element steps of
    every xpath are qualified with the ``ddi`` namespace prefix before
    being handed to the parent class.
    """

    #: XML namespaces
    NS = {'xsi': 'http://www.w3.org/2001/XMLSchema-instance',
          'ddi': 'ddi:codebook:2_5',
          'xml': 'http://www.w3.org/XML/1998/namespace'}

    #: Namespace-qualified tag that the document root element must carry.
    _expected_root_tag = '{ddi:codebook:2_5}codeBook'

    @classmethod
    def _xpath(cls, xpath):
        """Every xpath is prepended with namespace.

        Each ``/``-separated step gets the ``ddi:`` prefix, except empty
        steps, ``.`` and ``..``, which are kept as they are. For example
        ``./stdyDscr/citation`` becomes ``./ddi:stdyDscr/ddi:citation``.

        :param xpath: xpath without namespace prefixes
        :type xpath: str
        :returns: result of the parent class ``_xpath`` for the prefixed xpath
        """
        prepended = '/'.join(
            step if step in ('', '.', '..') else 'ddi:%s' % (step,)
            for step in xpath.split('/'))
        return super()._xpath(prepended)