Updated CWPair2 XML input parameters (WIP) and CWPAIR2_util.

cdf5d6db · GEOFFREY CHARLES BILLY · bf9740cd · cdf5d6db · cdf5d6db · cdf5d6db
Commit cdf5d6db authored 9 years ago by GEOFFREY CHARLES BILLY
--- a/tools/cwpair2/cwpair2.py
+++ b/tools/cwpair2/cwpair2.py
@@ -2,7 +2,7 @@
 cwpair2.py

 Input: gff format
-.gff format: chromosome (chr##), strand (+/-), start (index), end (index), value (read count) 
+.gff format: chromosome (chr##), strand (+/-), start (index), end (index), value (read count)

 Output: list of matched pairs and list of unmatched orphans
 Files: S (simple), D (detailed), O (orphans), P (frequency preview plot), F (final frequency plot), statistics.txt
@@ -15,7 +15,7 @@ import cwpair2_util
 if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--input', dest='inputs', action='append', help="Input datasets")
-    parser.add_argument('--mode', action='store', type='string', dest='method', default='mode', help='Method of finding match.')
+    parser.add_argument('--method', action='store', type='string', dest='method', default='mode', help='Method of finding match.')
    parser.add_argument('--up_distance', action='store', type='int', dest='up_distance', default=50, help='Distance upstream of a pair. Default 50.')
    parser.add_argument('--down_distance', action='store', type='int', dest='down_distance', default=100, help='Distance downstream of a pair. Default 100.')
    parser.add_argument('--bin_size', action='store', type='int', dest='binsize', default=1, help='Width of bins for plots and mode.')

--- a/tools/cwpair2/cwpair2.xml
+++ b/tools/cwpair2/cwpair2.xml
 <?xml version="1.0"?>
-<tool id="pughlab_genetrack" name="Genetrack" version="@WRAPPER_VERSION@.0">
-    <description>converts data to standard-peak GFF</description>
+<tool id="pughlab_cwpair2" name="CWPAIR2" version="@WRAPPER_VERSION@.0">
+    <description>matches multiple GFF peak files together</description>
    <macros>
-        <import>genetrack_macros.xml</import>
+        <import>cwpair2_macros.xml</import>
    </macros>
    <expand macro="requirements" />
    <command>
        <![CDATA[
            mkdir output &&
-            python $__tool_directory__/genetrack.py
-            --input_format $input_format_cond.input_format
-            #if str($input_format_cond.input_format) == "genetracktool":
-                #for $i in $input_format_cond.input_genetracktool:
-                     --input "${i}"
-                #end for
-            #elif str($input_format_cond.input_format) == "gff":
-                #for $i in $input_format_cond.input_gff:
-                     --input "${i}"
-                #end for
+            python $__tool_directory__/cwpair2.py
+            --method $method
+            --down_distance $down_distance
+            --bin_size $bin_size
+            --threshold_format $threshold_format_cond.threshold_format
+            #if str($threshold_format_cond.threshold_format) == "absolute_threshold":
+             	--threshold "${threshold_format_cond.absolute_threshold}"
+            #elif str($threshold_format_cond.threshold_format) == "relative_threshold":
+             	--threshold "${threshold_format_cond.relative_threshold}"
            #end if
-            --sigma $sigma
-            --exclusion $exclusion
-            --up_width $up_width
-            --down_width $down_width
-            --filter $filter
-            --chromosome $chromosome
-            --chunk_size $chunk_size
-            --output $output
-            --output_format $output_format
        ]]>
    </command>
    <inputs>
-        <param  name="input" type="data" format="gff" multiple="True" label="Convert gff files" />
-        <param name="sigma" type="integer" value="5" min="1" label="Sigma to use when smoothing reads to call peaks" help="Higher values increase computation but produce more smoothing." />
-        <param name="exclusion" type="integer" value="20" min="1" label="Peak exclusion zone" help="Exclusion zone around each peak that prevents others from being called." />
-        <param name="up_width" type="integer" value="0" min="0" label="Upstream width of called peaks" />
-        <param name="down_width" type="integer" value="0" min="0" label="Downstream width of called peaks" />
-        <param name="filter" type="integer" value="1" min="0" label="Absolute read filter" help="Removes peaks with lower peak height." />
-        <param name="chunk_size" type="integer" value="10" min="1" label="Chunk each chromosome into" help="Value is millions of base pairs where each size increment uses about 20MB of memory." />
+    	<conditional name="threshold_format_cond">
+            <param name="threshold_format" type="select" label="Format of the threshold of filtering pairs">
+                <option value="absolute_threshold">Absolute</option>
+                <option value="relative_threshold" selected="True">Relative</option>
+            </param>
+            <when value="absolute_threshold">
+                <param name="absolute_threshold" type="integer" label="Threshold to filter by" />
+            </when>
+            <when value="relative_threshold">
+                <param  name="relative_threshold" type="float" label="Percentile threshold" />
+            </when>
+        </conditional>
+        <param name="method" type="select" label="Method of finding a match">
+            <option value="mode" selected="True">Mode</option>
+            <option value="closest">Closest</option>
+            <option value="largest">Largest</option>
+        </param>
+        <param name="down_distance" type="integer" value="100" min="1" label="Distance downstream of a pair" />
+        <param name="bin_size" type="integer" value="1" min="1" label="Width of bins" />
+        <param name="plot_format" type="string" value="pdf" label="Format of output graph" />
+        <param name="output_format" type="string" value="gff" label="Output format of data" />
+        <param name="chrom_sort" type="string" value="asc" label="Sorting by chromosome"/>
+        <param name="score_sort" type="string" value ="none" label="Sorting by score"/>
        <param name="output_format" type="select" label="Format of converted files">
            <option value="gff" selected="True">Gff</option>
            <option value="genetracktool">GenetrackTool</option>

--- a/tools/cwpair2/cwpair2_util.py
+++ b/tools/cwpair2/cwpair2_util.py
@@ -40,9 +40,9 @@ def parse_chromosomes(reader):
        if len(line) == 9: 
            # gff3 format
            cname, junk, junk, start, end, value, strand, junk, junk = line
-    else: 
-        # txt format
-        cname, strand, start, end, value = line
+        else: 
+            # txt format
+            cname, strand, start, end, value = line
    start = int(start)
    end = int(end)
    value = float(value)
@@ -313,8 +313,10 @@ def perform_process(path, match_func, threshold, options):
        watson, crick = split_strands(chromosome)
        logging.debug('%d watson, %s crick peaks' % (len(watson), len(crick)))
    
-        watson.sort(key=lambda data: -float(data[3])) # Sort by value of each peak
-        crick.sort(key=lambda data: float(data[1])) # Sort by position to facilitate binary search
+        # Sort by value of each peak
+        watson.sort(key=lambda data: -float(data[3])) 
+        # Sort by position to facilitate binary search
+        crick.sort(key=lambda data: float(data[1])) 
 
        keys = make_keys(crick)
        for peak in watson:
@@ -326,8 +328,10 @@ def perform_process(path, match_func, threshold, options):
                match = METHODS[match_func](window, peak)
            
            if match:
-                # Write output
-                # (chr, start+, end+, value+, +, start-, end-, value-, -, dist, coord)
+                '''
+                Write output
+                (chr, start+, end+, value+, +, start-, end-, value-, -, dist, coord)
+                '''
                midpoint = (match[1] + match[2] + peak[1] + peak[2]) // 4
                d = distance(peak, match)
                dist.add(d)
@@ -396,11 +400,15 @@ def perform_process(path, match_func, threshold, options):
                

 def write_statistics(statistics):
-    ''' Writes a list of statistics to the file(s) specified by them'''
+    '''
+    Writes a list of statistics to the file(s) specified by them
+    '''
    logging.info('Writing statistics')
    by_file = {}
-    for stats in statistics: # Collect all the stats together by destination file
-        if not stats: # Skip "None" statistics from failed files
+    # Collect all the stats together by destination file
+    for stats in statistics: 
+        # Skip "None" statistics from failed files
+        if not stats: 
            continue
        path = stats['stats_path']
        if path not in by_file: