Source code for oligo.design

from __future__ import print_function, division

import argparse
import re
import sys

import numpy as np

from oligo.tools import Tools

recognition_seq = {'DpnII': 'GATC',
                   'NlaIII': 'CATG',
                   'HindIII': 'AAGCTT'}

def _check_value(values, labels):
    for value, label in zip(values, labels):
        if (value<1) | isinstance(value, float):
            raise ValueError(
                '{} must be an integer greater than 0'.format(label))
            break
    else:
        return None
    
def _compile_chr_regex(genome):
    if genome.startswith('hg'):
        twenties = '2[0-2]|'
    elif genome.startswith('mm'):
        twenties = ''
    valid_chroms = re.compile('^chr([1-9]|1[0-9]|'+twenties+'X|Y)$')
    
    return valid_chroms

def _create_key(chrom, *args):
    return (':'.join((chrom, '-'.join((map(str, x + ('000','000','X')))))) for x in args)

def _create_key_frag(chrom, oligo_coor, frag_coor, side):
    key = ':'.join((chrom, '-'.join(map(str, (oligo_coor + frag_coor)))))
    key = '-'.join((key, side))
            
    return key

def _get_sequence(seq, *args):
    return (str(seq[x[0]:x[1]]) for x in args)  # returns a generator so that all sequences outside of the dictionary are not retained in the memory

def _validate_chrom(chrom, regex):
    if not regex.match(chrom):
        raise ChromosomeError('Unrecognised chromosome {}. Skipping.')
    return None

class ChromosomeError(Exception):
    pass

class FragmentError(Exception):
    pass

class FragmentMixin(object):
    
    def _get_fragment_seqs(self, chrom, start, stop, chrom_seq=''):
        frag_length = stop - start
        if frag_length < self.oligo:
            raise FragmentError('{} is too small to design oligos in. Skipping.')
        
        left_coor = (start, start + self.oligo)
        right_coor = (stop - self.oligo, stop)
        frag_coor = (start, stop)
        
        left_key = _create_key_frag(chrom, left_coor, frag_coor, 'L')
        right_key = _create_key_frag(chrom, right_coor, frag_coor, 'R')
        
        if left_key in self.oligo_seqs:
            raise FragmentError('{} is redundant to another position. Skipping.')
        
        if not chrom_seq: chrom_seq = self.genome_seq[chrom].seq.upper()
        left_seq, right_seq = _get_sequence(chrom_seq, *(left_coor, right_coor))
        
        self.oligo_seqs[left_key] = left_seq     
        if frag_length > self.oligo: self.oligo_seqs[right_key] = right_seq
        
        return None

[docs] class Capture(Tools, FragmentMixin): """Designs oligos for Capture-C""" __doc__ += Tools.__doc__
[docs] def gen_oligos(self, bed, enzyme='DpnII', oligo=70): r"""Generates oligos flanking restriction fragments that encompass the coordinates supplied in the bed file Parameters ---------- bed : str Path to tab-delimited bed file containing a list of coordinates for viewpoints in the capture experiment. Must be in the format 'chr'\\t'start'\\t'stop'\\t'name'\\n enzyme : {'DpnII', 'NlaIII', 'HindIII'}, optional The enzyme for digestion, default = DpnII oligo : int, optional The length of the oligo to design (bp), default = 70 Returns ------- self : object """ _check_value((oligo,), ('Oligo size',)) self._create_attr(oligo) cut_sites = {} cut_size = len(recognition_seq[enzyme]) rec_seq = re.compile(recognition_seq[enzyme]) print('Generating oligos...') for chrom, attr in self.genome_seq.items(): chrom_seq = str(attr.seq).upper() cut_sites[chrom] = [cut_site.start() for cut_site in rec_seq.finditer(chrom_seq)] cut_sites[chrom] = np.array(cut_sites[chrom]) valid_chroms = _compile_chr_regex(self.genome) with open(bed) as viewpoints: for vp in viewpoints: chrom, vp_start, vp_stop, name = vp.strip().split('\t') try: _validate_chrom(chrom, valid_chroms) except ChromosomeError as e: print(str(e).format(chrom), file=sys.stderr) continue vp_start = int(vp_start) try: frag_start = cut_sites[chrom][cut_sites[chrom] <= vp_start][-1] frag_stop = cut_sites[chrom][cut_sites[chrom] > vp_start][0] + cut_size # currently this picks an adjacent fragment if the site is in a cutsite; are we okay with that? except IndexError: print('Viewpoint {} is not in a closed fragment. ' 'Skipping.'.format(name)) continue try: self._get_fragment_seqs(chrom, frag_start, frag_stop) except FragmentError as e: frag_id = '{}:{}-{} ({}) is in a fragment that'.format( chrom, vp_start, vp_stop, name) print(str(e).format(frag_id), file=sys.stderr) continue frag_key = '{}:{}-{}'.format(chrom, frag_start, frag_stop) self._assoc[frag_key] = '{}{},'.format( self._assoc.get(frag_key, ''), name) print('\t...complete.') if __name__ != '__main__': print('Oligos stored in the oligo_seqs attribute') return self
def __str__(self): return 'Capture-C oligo design object for the {} genome'.format( self.genome)
[docs] class Tiled(Tools, FragmentMixin): """Designs oligos adjacent to each other or on adjacent fragments""" __doc__ += Tools.__doc__
[docs] def gen_oligos_capture(self, chrom, region='', enzyme='DpnII', oligo=70): """Designs oligos for multiple adjacent restriction fragments across a specified region of a chromosome, or for the entire chromosome. Parameters ---------- chrom : str Chromosome number/letter e.g. 7 or X region : str, optional The region of the chromosome to design oligos, e.g. 10000-20000; omit this option to design oligos over the entire chromosome enzyme : {'DpnII', 'NlaIII', 'HindIII'}, optional The enzyme for digestion, default=DpnII oligo : int, optional The length of the oligos to design (bp), default=70 Returns ------- self : object """ _check_value((oligo,), ('Oligo size',)) self._create_attr(oligo) if 'chr' not in str(chrom): chrom = ''.join(('chr'+str(chrom))) chrom_seq = self.genome_seq[chrom].seq.upper() start, stop = (0, len(chrom_seq)) if not region else map(int, region.split('-')) rec_seq = re.compile(recognition_seq[enzyme]) cut_size = len(recognition_seq[enzyme]) cut_sites = [cut_site.start()+start for cut_site in rec_seq.finditer(str(chrom_seq[start:stop]))] print('Generating oligos...') for i in range(len(cut_sites)-1): j = i + 1 frag_start = cut_sites[i] frag_stop = cut_sites[j]+cut_size try: self._get_fragment_seqs(chrom, frag_start, frag_stop, chrom_seq=chrom_seq) except FragmentError as e: frag_id = 'The fragment {}:{}-{}'.format(chrom, frag_start, frag_stop) print(str(e).format(frag_id), file=sys.stderr) continue print('\t...complete.') if __name__ != '__main__': print('Oligos stored in the oligo_seqs attribute') return self
[docs] def gen_oligos_contig(self, chrom, region='', step=70, oligo=70): """Designs adjacent oligos based on a user-defined step size, across a specified region of a chromosome, or for the entire chromosome. Parameters ---------- chrom : str Chromosome number/letter e.g. 7 or X region : str, optional The region of the chromosome to design oligos, e.g. 10000-20000; omit this option to design oligos over the step : int, optional The step size, or how close adjacent oligos should be, default=70; for end-to-end oligos, set `step` to equal `oligo` oligo : int, optional The length of the oligos to design (bp), default=70 Raises ------ ValueError If `step` or `oligo` <1 or not an integer Returns ------- self : object """ _check_value((step, oligo), ('Step size', 'Oligo size')) self._create_attr(oligo) if not chrom.startswith('chr'): chrom = ''.join(('chr'+str(chrom))) chrom_seq = self.genome_seq[chrom].seq.upper() start, stop = (0, len(chrom_seq)) if not region else map( int, region.split('-')) stop = stop - oligo print('Generating oligos...') coors = [(x, x + oligo) for x in range(start, stop+1, step)] sequences = _get_sequence(chrom_seq, *coors) keys = _create_key(chrom, *coors) self.oligo_seqs.update(zip(keys, sequences)) print('\t...complete.') if __name__ != '__main__': print('Oligos stored in the oligo_seqs attribute') return self
def __str__(self): return 'Tiled Capture oligo design object for the {} genome'.format( self.genome)
[docs] class OffTarget(Tools): """Designs oligos adjacent to potential CRISPR off-target sites""" __doc__ += Tools.__doc__
[docs] def gen_oligos(self, bed, oligo=70, step=10, max_dist=200): r"""Designs oligos adjacent to user-supplied coordinates for potential CRISPR off-target cleavage sites Parameters ---------- bed : str Path to tab-delimited bed file containing a list of coordinates for viewpoints in the capture experiment; must be in the format 'chr'\\t'start'\\t'stop'\\t'name'\\n oligo : int, optional The length of the oligo to design (bp), default=70 step : int, optional The step size, or how close adjacent oligos should be, default=10; for end-to-end oligos, set `step` to equal `oligo` max_dist : int, optional The maximum distance away from the off-target site to design oligos to, default = 200 Returns ------- self : object """ _check_value((step, oligo, max_dist), ('Step size', 'Oligo size', 'Maximum distance')) self._create_attr(oligo) valid_chroms = _compile_chr_regex(self.genome) print('Generating oligos...') with open(bed) as crispr_sites: for site in crispr_sites: chrom, start, stop, name = site.strip().split('\t') try: _validate_chrom(chrom, valid_chroms) except ChromosomeError as e: print(str(e).format(chrom), file=sys.stderr) continue start, stop = map(int, (start, stop)) chrom_seq = self.genome_seq[chrom].seq.upper() chrom_length = len(chrom_seq) exc_start = start-10 exc_stop = stop+10 upstream = [(x-oligo, x) for x in range(start-max_dist+oligo, exc_start+1, step) if x-oligo>=0] downstream = [(x, x+oligo) for x in range(exc_stop, stop+max_dist-oligo+1, step) if x+oligo<=chrom_length] all_coors = upstream+downstream sequences = _get_sequence(chrom_seq, *all_coors) keys = list(_create_key(chrom, *all_coors)) self.oligo_seqs.update(zip(keys, sequences)) self._assoc.update({x[:-10]: name for x in keys}) print('\t...complete.') if __name__ != '__main__': print('Oligos stored in the oligo_seqs attribute') return self
def __str__(self): return 'OffTarget Capture oligo design object for the {} genome'.format( self.genome)
if __name__ == '__main__': parser = argparse.ArgumentParser() args = parser.parse_args(sys.argv[2:]) if not args.blat and not args.star_index: msg = '-s/--star_index argument is required if --blat is not selected' parser.error(msg)