Source code for modos.genomics.formats
from __future__ import annotations
from enum import Enum
from pathlib import Path
from typing import Iterator, Optional
import pysam
from .region import Region
[docs]
class GenomicFileSuffix(tuple, Enum):
"""Enumeration of all supported genomic file suffixes."""
[docs]
VCF = (".vcf", ".vcf.gz")
[docs]
FASTA = (".fasta", ".fa")
[docs]
FASTQ = (".fastq", ".fq")
@classmethod
[docs]
def from_path(cls, path: Path) -> GenomicFileSuffix:
for genome_ft in cls:
if "".join(path.suffixes) in genome_ft.value:
return genome_ft
supported = [fi_format for fi_format in cls]
raise ValueError(
f'Unsupported file format: {"".join(path.suffixes)}.\n'
f"Supported formats:{supported}"
)
[docs]
def get_index_suffix(self) -> str:
"""Return the supported index suffix related to a genomic filetype"""
match self.name:
case "BAM" | "SAM":
return ".bai"
case "BCF":
return ".csi"
case "CRAM":
return ".crai"
case "FASTA" | "FASTQ":
return ".fai"
case "VCF":
return ".tbi"
[docs]
def to_htsget_endpoint(self) -> str:
"""Return the htsget endpoint for a genomic file type"""
match self.name:
case "BAM" | "CRAM":
return "reads"
case "VCF" | "BCF":
return "variants"
case _:
raise ValueError(f"No htsget endpoint for format {self.name}")
[docs]
def read_pysam(
path: Path,
region: Optional[Region] = None,
**kwargs,
) -> Iterator[pysam.AlignedSegment | pysam.VariantRecord]:
"""Automatically instantiate a pysam file object from input path and passes any additional kwarg to it."""
out_fileformat = GenomicFileSuffix.from_path(Path(path)).name
match out_fileformat:
case "CRAM" | "BAM":
pysam_func = pysam.AlignmentFile
case "VCF" | "BCF":
pysam_func = pysam.VariantFile
case _:
raise ValueError(f"Unsupported output file type.")
pysam_handle = pysam_func(str(path), **kwargs)
if region is None:
stream = (rec for rec in pysam_handle)
else:
stream = pysam_handle.fetch(*region.to_tuple())
return stream