Source code for modos.genomics.cram

"""Utilities to interact with genomic intervals in CRAM files."""

from pathlib import Path

import pysam
import modos_schema.datamodel as model


[docs] def extract_metadata( instance: model.AlignmentSet, base_path: Path ) -> list[model.ReferenceSequence]: """Extract metadata from the CRAM file header and convert specific attributes according to the modo schema.""" cram = pysam.AlignmentFile(str(base_path / instance.data_path), mode="rc") cram_head = cram.header ref_list: list = [] for refseq in cram_head.get("SQ"): refseq_mod = model.ReferenceSequence( id=create_sequence_id(refseq.get("SN"), refseq.get("M5")), name=refseq.get("SN"), sequence_md5=refseq.get("M5"), source_uri=refseq.get("UR"), description=refseq.get("DS"), ) ref_list.append(refseq_mod) # NOTE: Could also extract species name, sample name, sequencer etc. here return ref_list
[docs] def validate_cram_files(cram_path: str): """Validate CRAM files using pysam. Checks if the file is sorted and has an index."""
# NOTE: Not a priority # TODO: # Check if sorted # Check if index exists # Check if reference exists # TODO: Add functions to edit CRAM files (liftover)
[docs] def create_sequence_id(name: str, sequence_md5: str) -> str: """Helper function to create a unique id from a sequence name and md5 hash""" return name + "_" + sequence_md5[:6]