ó
w	YTc        
   @   s—   d  d l  Z d  d l Z d „  Z d „  Z d „  Z d „  Z d „  Z d „  Z d d „ Z
 d	 „  Z d
 d „ Z d e e e d d d d e d „	 Z d S(   iÿÿÿÿNc         C   sY   g  } t  |  d ƒ } x! | D] } | j | j ƒ  ƒ q Wg  | D] } t t | ƒ ^ q@ S(   s`   
    Loads a matrix from a file given.
    Returns matrix as an array of arrays of numbers.
    t   r(   t   opent   appendt   splitt   mapt   float(   t   filenamet   arrayt   filet   linet   i(    (    s   ./MOODS/__init__.pyt   load_matrix   s
    c         C   sC   g  |  D] } | ^ q } | j  ƒ  x | D] } | j  ƒ  q+ W| S(   s-   
    Creates a reverse complement of PWM
    (   t   reverse(   t   matrixt   rowR    t   me(    (    s   ./MOODS/__init__.pyt   reverse_complement   s
    
c         C   s`   g  } xS t  t |  d ƒ ƒ D]; } g  } x |  D] } | j | | ƒ q0 W| j | ƒ q W| S(   s-   
    Creates a transpose of matrix array
    i    (   t   ranget   lenR   (   R   t   resR
   t   tmpt   g(    (    s   ./MOODS/__init__.pyt	   transpose   s    c         C   s   t  t t t |  ƒ ƒ ƒ S(   s.   
    Calculates a maximum score of matrix
    (   t   sumR   t   maxR   (   R   (    (    s   ./MOODS/__init__.pyt	   max_score%   s    c         C   s   t  j t |  ƒ | ƒ S(   sw   
    Estimates the background distribution of nucleotides
    from seq. The pseudocount ps is added to all counts.
    (   t   _cmodulet   _bg_from_sequencet   str(   t   seqt   ps(    (    s   ./MOODS/__init__.pyt   bg_from_sequence+   s    c         C   s   t  j |  | | ƒ S(   s  
    Calculates an absolute threshold from a probability value.
    Returns: 
              A threshold value T such that the probability that
              the distribution bg generates a sequence scoring
              at least T on the input matrix is p.
    (   R   t   _threshold_from_p(   R   t   bgt   p(    (    s   ./MOODS/__init__.pyt   threshold_from_p2   s    c         C   s[   t  j |  | | ƒ } | rW g  | D], } g  | D] } | t j | ƒ ^ q/ ^ q" } n  | S(   s<  
    Calculates a log-odds matrix from a position frequency matrix
    Returns: 
              The input PWM matrix transformed to log-odds scores.
              The score for nucleotide N and position i is
              
                 log ( matrix[N][i] + ps * bg[N] ) / C[i]
                   - log ( bg[N] ),
                   
              where C[i] is the sum of terms
              
                 matrix[N][i] + ps * bg[N]
                 
              for all nucleotides N. 
    Parameters: 
             Obligatory:
                matrix
                    Input PWM as a float or integer matrix.
                bg
                    Normalised background distribution given as
                    a list of four floats.
                ps
                    Multiplier for pseudocounts. If matrix is
                    a frequency matrix, you may want to
                    set this to zero.
             Optional:
                log_base  
                    Base for logarithms. Defaults to natural
                    logarithm if None is given.
    (   R   t   _count_log_oddst   matht   log(   R   R!   R   t   log_baset   retR   t   val(    (    s   ./MOODS/__init__.pyt   count_log_odds<   s    <c         C   s   t  t d „  |  ƒ ƒ S(   s/   
    Calculates a total number of matches.
    c         S   s
   t  |  ƒ S(   N(   R   (   t   x(    (    s   ./MOODS/__init__.pyt   <lambda>f   s    (   R   R   (   t
   matchArray(    (    s   ./MOODS/__init__.pyt   total_matchesb   s    i   c         C   s!   g  t  |  ƒ D] } d |  ^ q S(   s6   
    Creates a flat background distribution table
    g      ð?(   R   (   t   sizeR
   (    (    s   ./MOODS/__init__.pyt   flatbgh   s    i   t   lfi   c      	   C   s  t  |  ƒ } t  | ƒ } | s0 t |  | ƒ } n= t | ƒ t t ƒ  ƒ k s] t  | ƒ d k rm t d ƒ ‚ d St | ƒ t t ƒ  ƒ k r¤ g  | D] } | ^ q } n" t  | ƒ | k sÆ t d ƒ ‚ d S| rü | g  | D] } t | ƒ ^ qÖ } d | } n  | r-g  | D] } t | | | | ƒ ^ q	} n  | rrg  t t  | ƒ ƒ D]# } t	 j
 | | | | | ƒ ^ qF} n  t j t |  ƒ | | | |	 |
 | ƒ } | r| |  } g  | | D]/ } g  | D] \ } } | | | f ^ qÁ^ q´} g  t | | ƒ D] \ } } | | ^ qù} n  | S(   sH  
    Finds position weight matrix matches in DNA sequence. 
    Returns: 
              An array of references to result arrays. There is one result
              array for each matrix, in the same order as the input matrices.
              Each result array is a list of tuples of position and score 
              given as: [(pos1, score1), (pos2, score2) ...]
    Parameters: 
             Obligatory:
                sequence
                    DNA sequence as python string object, containing characters
                    acgtACGT.
                matrices
                    An array of matrices, each represented as a list of four
                    lists of equal length. These lists correspond the
                    frequencies or scores of the nucleotides A, C, G and T,
                    respectively.
                thresholds
                    A number or a list of numbers, used as threshold values for
                    matrix scanning.  If a single number is given, it is used
                    for all matrices; otherwise, there should be as many
                    threshold values as there are matrices.
             Optional:
                bg  
                    Background distribution as an array of four doubles,
                    corresponding to the frequencies of A, C, G and T,
                    respectively. By default the background is estimated from
                    the sequence.
                convert_log_odds
                    If True, assumes that the input matrices are frequency or
                    count matrices, and converts them to log-odds scoring
                    matrices using function count_log_odds; otherwise, treat
                    them as scoring matrices. Default True.
                threshold_from_p
                    If True, assumes that thresholds are p-values and computes
                    the corresponding absolute threshold based on the matrix
                    using function threshold_from_p; otherwise the threshold
                    is used as a hard cut-off. Default True.
                log_base
                    Base for logarithms used in log-odds computations. Relevant
                    if using convert_log_odds=True and threshold_from_p=False.
                    Defaults to natural logarithm if None is given.
                pseudocount
                    Pseudocount used in log-odds conversion and added to
                    sequence symbol counts when estimating the background
                    from sequence. Default 1.
                both_strands
                    Scans against reverse complement sequence in addition to
                    the input sequence. Hits on reverse complement are reported
                    at position [position - sequence_length], which is always
                    negative. The actual hit site for any hit is always
                    seq[pos, pos + matrix_length].
                    Default False.
             Tuning parameters:
                (Optional, do not affect the results, but can give minor
                 speed-ups in some cases. You can pretty much ignore these.)
                algorithm  
                     Selects the algorithm to use for scanning
                        "naive" naive algorithm
                        "pla" permutated lookahead algorithm
                        "supera" super alphabet algorithm. 
                          - Good for long matrices (> 20)
                        "lf" lookahead filtration algorithm. 
                          - Default algorithm in most cases.
                          - Sequence can be searched with multiple matrices 
                            simultaneously. 
                q  
                    An integer, used for fine-tuning "supera" and "lf" algorithms.
                    The default value 7 should be ok pretty much always, but can 
                    be tuned to possibly slightly increase performance. 
                combine
                    True or False, determines whether "lf" algorithm combines all
                    matrices to a single scanning pass. Default True.

    i   s1   Background does not seem to be a list of length 4s:   Number of thresholds does not match the number of matricesi   N(   R   R   t   typet   listt   RuntimeErrort   NoneR   R*   t   xranget   MOODSR#   R   t   _searchR   t   zip(   t   sequencet   matricest
   thresholdsR!   t   convert_log_oddsR#   t   both_strandsR'   t   pseudocountt	   algorithmt   qt   combinet   nt   mR
   R   R(   t	   main_hitst   hitst   post   scoret   reverse_hitst   mainR   (    (    s   ./MOODS/__init__.pyt   searchn   s2    N-#+?'
@/(   t   MOODS._cmoduleR7   R%   R   R   R   R   R   R#   R5   R*   R.   R0   t   Truet   FalseRK   (    (    (    s   ./MOODS/__init__.pyt   <module>   s   		
				
&	