Source code for modos.genomics.region

from __future__ import annotations
import math
from dataclasses import dataclass
from urllib.parse import parse_qs, urlparse
from typing import Optional

import pysam


@dataclass(order=True)
[docs] class Region: """Genomic region consisting of a chromosome (aka reference) name and a 0-indexed half-open coordinate interval. Note that the end may not be specified, in which it will be set to math.inf. """
[docs] chrom: str
[docs] start: int
[docs] end: int | float
[docs] def __post_init__(self): if not self.chrom: raise ValueError("Chromosome must be specified") if self.start < 0: raise ValueError("Start must be non-negative") if self.end < self.start: raise ValueError("End must be greater than or equal to start")
[docs] def to_htsget_query(self): """Serializes the region into an htsget URL query. Example ------- >>> Region(chrom='chr1', start=0, end=100).to_htsget_query() 'referenceName=chr1&start=0&end=100' """ query = f"referenceName={self.chrom}&start={self.start}" if self.end != math.inf: query += f"&end={self.end}" return query
[docs] def to_tuple(self) -> tuple[str, Optional[int], Optional[int]]: """Return the region as a simple tuple.""" return ( self.chrom, self.start, int(self.end) if self.end != math.inf else None, )
@classmethod
[docs] def from_htsget_query(cls, url: str): """Instantiate from an htsget URL query Example ------- >>> Region.from_htsget_query( ... "http://localhost/htsget/reads/ex/demo1?format=CRAM&referenceName=chr1&start=0" ... ) Region(chrom='chr1', start=0, end=inf) """ query = parse_qs(urlparse(url).query) try: chrom = query["referenceName"][0] except KeyError: raise KeyError("referenceName is missing") start = int(query.get("start", [0])[0]) end = float(query.get("end", [math.inf])[0]) return Region(chrom, start, end)
@classmethod
[docs] def from_ucsc(cls, ucsc: str) -> Region: """Instantiate from a UCSC-formatted region string. Example ------- >>> Region.from_ucsc('chr-1ba:10-320') Region(chrom='chr-1ba', start=10, end=320) >>> Region.from_ucsc('chr1:-320') Region(chrom='chr1', start=0, end=320) >>> Region.from_ucsc('chr1:10-') Region(chrom='chr1', start=10, end=inf) >>> Region.from_ucsc('chr1:10') Region(chrom='chr1', start=10, end=inf) Note ---- For more information about the UCSC coordinate system, see: http://genomewiki.ucsc.edu/index.php/Coordinate_Transforms """ try: chrom, interval = ucsc.split(":") except ValueError: # no : separator -> only chrom specified return cls(ucsc, 0, math.inf) try: start, end = interval.split("-") except ValueError: # no - separator -> only start specified return cls(chrom, int(interval), math.inf) # Allow empty strings for start and end start = 0 if start == "" else int(start) end = math.inf if end == "" else int(end) return cls(chrom, start, end)
@classmethod
[docs] def from_pysam( cls, record: pysam.VariantRecord | pysam.AlignedSegment ) -> Region: match record: case pysam.VariantRecord(): chrom = record.chrom start = record.start end = record.stop case pysam.AlignedSegment(): chrom = record.reference_name start = record.reference_start end = record.reference_end case _: raise TypeError( "record must be a pysam.VariantRecord or pysam.AlignedSegment" ) match (chrom, start, end): case str(), int(), int(): return cls(chrom, start, end) case _: raise ValueError("Record must have coordinates")
[docs] def overlaps(self, other: Region) -> bool: """Checks if other in self. This check if any portion of other overlaps with self. """ same_chrom = self.chrom == other.chrom starts_in = self.start <= other.start <= self.end ends_in = self.start <= other.end <= self.end return same_chrom and (starts_in or ends_in)
[docs] def contains(self, other: Region) -> bool: """Checks if other is fully contained in self.""" same_chrom = self.chrom == other.chrom starts_in = self.start <= other.start <= self.end ends_in = self.start <= other.end <= self.end return same_chrom and starts_in and ends_in