Source code for modos.helpers.schema

"""Introspection utilities for the MODO schema.

This module provides helpers for accessing the schema structure
and for converting instances to different representations.
"""
from enum import Enum
from functools import lru_cache, reduce
from pathlib import Path
from typing import Any, Mapping, Optional, Union
from urllib.parse import urlparse

import zarr
from linkml_runtime.dumpers import rdflib_dumper
from linkml_runtime.utils.schemaview import SchemaView
from rdflib import Graph
from rdflib.term import URIRef

import modos_schema.datamodel as model
import modos_schema.schema as schema


[docs] SCHEMA_PATH = Path(schema.__path__[0]) / "modos_schema.yaml"
[docs] def class_from_name(name: str): class_names = list(load_schema().all_classes().keys()) if name not in class_names: raise ValueError(f"Unknown class name: {name}") return getattr(model, name)
[docs] def dict_to_instance(element: Mapping[str, Any]) -> Any: elem_type = element.get("@type") target_class = class_from_name(elem_type) return target_class( **{k: v for k, v in element.items() if k not in "@type"} )
[docs] def is_full_id(element_id: str) -> bool: """Checks if an element_id contains the element type as prefix. Examples -------- >>> is_full_id("sample1") False >>> is_full_id("data/test") True >>> is_full_id("/assay/test_assay") True """ etypes = [elem.value + "/" for elem in ElementType] extended_etypes = etypes + ["/" + etype for etype in etypes] return element_id.startswith(tuple(extended_etypes))
[docs] def set_haspart_relationship( child_class: str, child_path: str, parent_group: zarr.hierarchy.Group, ): """Add element to the hasPart attribute of a parent zarr group""" parent_type = getattr( model, parent_group.attrs.get("@type"), ) has_prop = get_haspart_property(child_class) parent_slots = parent_type.__match_args__ if has_prop not in parent_slots: raise ValueError( f"Cannot make {child_path} part of {parent_group.name}: {parent_type} does not have property {has_prop}" ) # has_part is multivalued if has_prop not in parent_group.attrs: parent_group.attrs[has_prop] = [] parent_group.attrs[has_prop] += [child_path]
[docs] def update_haspart_id( element: model.DataEntity | model.Sample | model.Assay | model.ReferenceGenome | model.MODO, ): """update the id of the has_part property of an element to use the full id including its type""" haspart_names = load_schema().slot_children("has_part") haspart_list = [ haspart for haspart in haspart_names if haspart in vars(element).keys() ] if len(haspart_list) > 0: for has_part in haspart_list: haspart_type = get_slot_range(has_part) type_name = ElementType.from_model_name(haspart_type).value updated_ids = [ id if is_full_id(id) else f"{type_name}/{id}" for id in getattr(element, has_part) ] setattr(element, has_part, updated_ids) return element
[docs] def set_data_path( element: dict, source_file: Optional[Union[Path, str]] = None ) -> dict: """Set the data_path attribute, if it is not specified to the modo root.""" if source_file and not element.get("data_path"): element["data_path"] = Path(source_file).name return element
[docs] class UserElementType(str, Enum): """Enumeration of element types exposed to the user."""
[docs] SAMPLE = "sample"
[docs] ASSAY = "assay"
[docs] DATA_ENTITY = "data"
[docs] REFERENCE_GENOME = "reference"
[docs] def get_target_class( self, ) -> type: """Return the target class for the element type.""" match self: case UserElementType.SAMPLE: return model.Sample case UserElementType.ASSAY: return model.Assay case UserElementType.DATA_ENTITY: return model.DataEntity case UserElementType.REFERENCE_GENOME: return model.ReferenceGenome case _: raise ValueError(f"Unknown element type: {self}")
@classmethod
[docs] def from_object(cls, obj): """Return the element type from an object.""" match obj: case model.Sample(): return UserElementType.SAMPLE case model.Assay(): return UserElementType.ASSAY case model.DataEntity(): return UserElementType.DATA_ENTITY case model.ReferenceGenome(): return UserElementType.REFERENCE_GENOME case _: raise ValueError(f"Unknown object type: {type(obj)}")
[docs] class ElementType(str, Enum): """Enumeration of all element types."""
[docs] SAMPLE = "sample"
[docs] ASSAY = "assay"
[docs] DATA_ENTITY = "data"
[docs] REFERENCE_GENOME = "reference"
[docs] REFERENCE_SEQUENCE = "sequence"
[docs] def get_target_class( self, ) -> type: """Return the target class for the element type.""" match self: case ElementType.SAMPLE: return model.Sample case ElementType.ASSAY: return model.Assay case ElementType.DATA_ENTITY: return model.DataEntity case ElementType.REFERENCE_GENOME: return model.ReferenceGenome case ElementType.REFERENCE_SEQUENCE: return model.ReferenceSequence case _: raise ValueError(f"Unknown element type: {self}")
@classmethod
[docs] def from_object(cls, obj): """Return the element type from an object.""" match obj: case model.Sample(): return ElementType.SAMPLE case model.Assay(): return ElementType.ASSAY case model.DataEntity(): return ElementType.DATA_ENTITY case model.ReferenceGenome(): return ElementType.REFERENCE_GENOME case model.ReferenceSequence(): return ElementType.REFERENCE_SEQUENCE case _: raise ValueError(f"Unknown object type: {type(obj)}")
@classmethod
[docs] def from_model_name(cls, name: str): """Return the element type from an object name.""" match name: case "Sample": return ElementType.SAMPLE case "Assay": return ElementType.ASSAY case "DataEntity": return ElementType.DATA_ENTITY case "ReferenceGenome": return ElementType.REFERENCE_GENOME case "ReferenceSequence": return ElementType.REFERENCE_SEQUENCE case _: raise ValueError(f"Unknown object type: {name}")
[docs] def is_uri(text: str): """Checks if input is a valid URI.""" try: result = urlparse(text) return all([result.scheme, result.netloc]) except AttributeError: return False
@lru_cache(1)
[docs] def load_schema() -> SchemaView: """Return a view over the schema structure.""" return SchemaView(SCHEMA_PATH)
@lru_cache(1)
[docs] def load_prefixmap() -> Any: """Load the prefixmap.""" return SchemaView(SCHEMA_PATH, merge_imports=False).schema.prefixes
[docs] def get_slots(target_class: type, required_only=False) -> list[str]: """Return a list of required slots for a class.""" slots = [] class_slots = target_class.__match_args__ for slot_name in class_slots: if not required_only or load_schema().get_slot(slot_name).required: slots.append(slot_name) return slots
[docs] def instance_to_graph(instance) -> Graph: # NOTE: This is a hack to get around the fact that the linkml # stores strings instead of URIRefs for prefixes. prefixes = { p.prefix_prefix: URIRef(p.prefix_reference) for p in load_prefixmap().values() } g = rdflib_dumper.as_rdf_graph( instance, prefix_map=prefixes, schemaview=load_schema(), ) # NOTE: This is a hack to get around the fact that the linkml's # rdf dumper does not iunclude schema:identifier in the graph. # Patch schema -> http://schema.org (rdflib's default is https) g.bind("schema", "http://schema.org/", replace=True) try: id_slot = ( load_schema().get_identifier_slot(type(instance).__name__).slot_uri ) g.add( ( URIRef(instance.id), g.namespace_manager.expand_curie(str(id_slot)), URIRef(instance.id), ) ) except AttributeError: pass return g
[docs] def get_slot_range(slot_name: str) -> str: """Return the class-independent range of a slot.""" return load_schema().get_slot(slot_name).range
[docs] def get_enum_values(enum_name: str) -> Optional[list[str]]: return list(load_schema().get_enum(enum_name).permissible_values.keys())
[docs] def get_haspart_property(child_class: str) -> Optional[str]: """Return the name of the "has_part" property for a target class. If no such property is in the schema, return None. Examples -------- >>> get_haspart_property('AlignmentSet') 'has_data' >>> get_haspart_property('Assay') 'has_assay' """ # find all subproperties of has_part prop_names = load_schema().slot_children("has_part") for prop_name in prop_names: targets = get_slot_range(prop_name) if isinstance(targets, str): targets = [targets] # When considering the slot range, # include subclasses or targets sub_targets = map(load_schema().get_children, targets) sub_targets = reduce(lambda x, y: x + y, sub_targets) all_targets = targets + [t for t in sub_targets if t] if child_class in all_targets: return prop_name return None