Skip to content
Snippets Groups Projects
Commit ba1e466f authored by GEOFFREY CHARLES BILLY's avatar GEOFFREY CHARLES BILLY
Browse files

Merge branch 'master' into 'master'

Add the bam_to_scidx tool.

See merge request !13
parents a901390c 9b21a38e
No related branches found
No related tags found
No related merge requests found
name: bam_to_scidx
owner: iuc
description: Contains a tool that converts a BAM file to an ScIdx file.
homepage_url:
long_description: |
Contains a tool that converts BAM data to ScIdx, the Strand-specific coordinate count format,
which is used by tools within the Chip-exo Galaxy flavor. The Chip-exo Galaxy flavor is used
by the Center for Eukaryotic Gene Regulation labs at The Pennsylvania State University. ScIdx
files are 1-based.
remote_repository_url: https://github.com/galaxyproject/tools-iuc/tree/master/tools/bam_to_scidx
type: unrestricted
categories:
- Convert Formats
File added
"""
bam_to_scidx.py
Input: BAM file
Output: Converted scidx file
"""
import optparse
import os
import shutil
import subprocess
import sys
import tempfile
BUFF_SIZE = 1048576
def get_stderr_exception(tmp_err, tmp_stderr):
"""
Return a stderr string of reasonable size.
"""
tmp_stderr.close()
# Get stderr, allowing for case where it's very large.
tmp_stderr = open(tmp_err, 'rb')
stderr_str = ''
buffsize = BUFF_SIZE
try:
while True:
stderr_str += tmp_stderr.read(buffsize)
if not stderr_str or len(stderr_str) % buffsize != 0:
break
except OverflowError:
pass
tmp_stderr.close()
return stderr_str
def stop_err(msg):
sys.stderr.write(msg)
sys.exit(1)
parser = optparse.OptionParser()
parser.add_option('-j', '--jar_file', dest='jar_file', type='string', help='BAMtoIDX.jar')
parser.add_option('-b', '--input_bam', dest='input_bam', type='string', help='Input dataset in BAM format')
parser.add_option('-i', '--input_bai', dest='input_bai', type='string', help='Input dataset index')
parser.add_option('-r', '--read', dest='read', type='int', default=0, help='Reads.')
parser.add_option('-o', '--output', dest='output', type='string', help='Output dataset in scidx format')
options, args = parser.parse_args()
tmp_dir = tempfile.mkdtemp(prefix='tmp-scidx-')
tmp_out = tempfile.NamedTemporaryFile().name
tmp_stdout = open(tmp_out, 'wb')
tmp_err = tempfile.NamedTemporaryFile().name
tmp_stderr = open(tmp_err, 'wb')
# Link input BAM file and associated index file into the working directory.
input_data_file_name = "input.bam"
os.link(options.input_bam, os.path.join(tmp_dir, input_data_file_name))
os.link(options.input_bai, os.path.join(tmp_dir, "%s.bai" % input_data_file_name))
cmd = "java -jar %s -b %s -s %d -o %s" % (options.jar_file, input_data_file_name, options.read, options.output)
proc = subprocess.Popen(args=cmd, stdout=tmp_stdout, stderr=tmp_stderr, shell=True, cwd=tmp_dir)
return_code = proc.wait()
if return_code != 0:
error_message = get_stderr_exception(tmp_err, tmp_stderr)
stop_err(error_message)
if tmp_dir and os.path.exists(tmp_dir):
shutil.rmtree(tmp_dir)
<tool id="bam_to_scidx" name="Convert BAM to ScIdx" version="1.0.0">
<description></description>
<command>
<![CDATA[
python $__tool_directory__/bam_to_scidx.py
-j $__tool_directory__/BAMtoIDX.jar
-b "$input_bam"
-i "${input_bam.metadata.bam_index}"
-r $read
-o "$output"
]]>
</command>
<inputs>
<param name="input_bam" type="data" format="bam" label="BAM file" help="BAM file must be sorted and indexed" />
<param name="read" type="select" label="Read to output">
<option value="0" selected="True">Read1</option>
<option value="1">Read2</option>
<option value="2">Combined</option>
</param>
</inputs>
<outputs>
<data name="output" format="scidx" label="${tool.name} on ${on_string}" />
</outputs>
<tests>
<test>
<param name="input" value="input.bam" ftype="bam" />
<param name="read" value="0" />
<output name="output" file="output.scidx" />
</test>
</tests>
<help>
**What it does**
Converts BAM data to ScIdx, the Strand-specific coordinate count format, which is used by
tools within the Chip-exo Galaxy flavor. The Chip-exo Galaxy flavor is used by the Center for
Eukaryotic Gene Regulation labs at The Pennsylvania State University. ScIdx files are 1-based.
</help>
<citations>
<citation type="bibtex">
@unpublished{None,
author = {None},
title = {None},
year = {None},
eprint = {None},
url = {http://www.huck.psu.edu/content/research/independent-centers-excellence/center-for-eukaryotic-gene-regulation}
}</citation>
</citations>
</tool>
File added
#2015-11-23 20:18:56.51;input.bam;READ1
chrom index forward reverse value
chr17 1820100 0 1 1
chr17 1849175 1 0 1
chr17 2778159 0 1 1
chr17 3205518 0 1 1
chr17 12974350 1 0 1
chr17 20624912 0 1 1
chr17 27071176 0 1 1
chr17 34920819 1 0 1
chr17 39810583 0 1 1
chr17 43542379 0 1 1
chr17 45800656 1 0 1
chr17 47124830 0 1 1
chr17 48697995 0 1 1
chr17 54474619 0 1 1
chr17 64901353 0 1 1
chr17 69072332 1 0 1
chr17 76184543 1 0 1
chr17 76518878 0 1 1
chr17 76941015 1 0 1
......@@ -27,7 +27,6 @@ if __name__ == '__main__':
parser.add_argument('--relative_threshold', dest='relative_threshold', type=float, default=0.0, help='Percentage to filter the 95th percentile.')
parser.add_argument('--absolute_threshold', dest='absolute_threshold', type=float, default=0.0, help='Absolute value to filter.')
parser.add_argument('--output_files', dest='output_files', default='simple', help='Restrict output dataset collections.')
parser.add_argument('--sort_chromosome', dest='sort_chromosome', default='asc', help='Sort output by chromosome.')
parser.add_argument('--sort_score', dest='sort_score', default='no', help='Sort output by score.')
parser.add_argument('--statistics_output', dest='statistics_output', help='Statistics output file.')
args = parser.parse_args()
......@@ -50,7 +49,6 @@ if __name__ == '__main__':
args.down_distance,
args.binsize,
args.output_files,
args.sort_chromosome,
args.sort_score)
statistics.extend(stats)
# Accumulate statistics.
......
......@@ -22,7 +22,6 @@
--relative_threshold $threshold_format_cond.relative_threshold
#end if
--output_files $output_files
--sort_chromosome $sort_chromosome
--sort_score $sort_score
--statistics_output "$statistics_output"
]]>
......@@ -56,10 +55,6 @@
<option value="simple_orphan_detail">matched pairs, orphans and details only (D,O,S)</option>
<option value="all">no restrictions (output everything: C,D,F,O,P,S)</option>
</param>
<param name="sort_chromosome" type="select" label="Sort output by chromosomes in" help="Chromosome strings that are not numeric will not be sorted" >
<option value="asc" selected="True">ascending order</option>
<option value="desc">descending order</option>
</param>
<param name="sort_score" type="select" label="Sort output by score?">
<option value="no" selected="True">No</option>
<option value="asc">Yes, in ascending order</option>
......@@ -151,7 +146,6 @@
<param name="threshold_format" value="relative_threshold" />
<param name="relative_threshold" value="0.0" />
<param name="output_files" value="simple" />
<param name="sort_chromosome" value="asc" />
<param name="sort_score" value="asc" />
<output_collection name="closest_S" type="list">
<element name="closest_S_data_1_f0u25d100b1" file="closest_s_output1.gff" ftype="gff" />
......@@ -173,7 +167,6 @@
<param name="threshold_format" value="relative_threshold" />
<param name="relative_threshold" value="0.0" />
<param name="output_files" value="all" />
<param name="sort_chromosome" value="asc" />
<param name="sort_score" value="no" />
<output_collection name="closest_D" type="list">
<element name="closest_D_data_1_f0u50d100b1" file="closest_d_output2.tabular" ftype="tabular" />
......
......@@ -225,7 +225,7 @@ def create_directories(method):
def process_file(dataset_path, galaxy_hid, method, threshold, up_distance,
down_distance, binsize, output_files, sort_chromosome, sort_score):
down_distance, binsize, output_files, sort_score):
if method == 'all':
match_methods = METHODS.keys()
else:
......@@ -240,7 +240,6 @@ def process_file(dataset_path, galaxy_hid, method, threshold, up_distance,
down_distance,
binsize,
output_files,
sort_chromosome,
sort_score)
statistics.append(stats)
if output_files == 'all' and method == 'all':
......@@ -251,7 +250,7 @@ def process_file(dataset_path, galaxy_hid, method, threshold, up_distance,
def perform_process(dataset_path, galaxy_hid, method, threshold, up_distance,
down_distance, binsize, output_files, sort_chromosome, sort_score):
down_distance, binsize, output_files, sort_score):
output_details = output_files in ["all", "simple_orphan_detail"]
output_plots = output_files in ["all"]
output_orphans = output_files in ["all", "simple_orphan", "simple_orphan_detail"]
......@@ -367,27 +366,12 @@ def perform_process(dataset_path, galaxy_hid, method, threshold, up_distance,
orphan_output.writerow((cname, cpeak[0], cpeak[1], cpeak[2], cpeak[3]))
# Keep track of orphans for statistics.
orphans += len(crick)
# Sort output by chromosome if specified.
if sort_chromosome == "asc":
try:
x.sort(key=lambda data: int(data[3]))
x.sort(key=lambda data: int(data[0]))
except:
# Cannot sort because chromosome number is not a numeric.
pass
elif sort_chromosome == "desc":
try:
x.sort(key=lambda data: int(data[0]), reverse=True)
x.sort(key=lambda data: int(data[3]), reverse=True)
except:
# Cannot sort because chromosome number is not a numeric.
pass
# Sort output by score if specified.
if sort_score == "desc":
x.sort(key=lambda data: float(data[5]), reverse=True)
elif sort_score == "asc":
x.sort(key=lambda data: float(data[5]))
# Writing a summary to txt or gff format file
# Writing a summary to gff format file
for row in x:
row_tmp = list(row)
# Dataset in tuple cannot be modified in Python, so row will
......
......@@ -9,11 +9,6 @@ import tempfile
GFF_EXT = 'gff'
SCIDX_EXT = 'scidx'
ROMAN = ['0', 'I', 'II', 'III', 'IV', 'V', 'VI', 'VII', 'VIII', 'IX', 'X',
'XI', 'XII', 'XIII', 'XIV', 'XV', 'XVI', 'XVII', 'XVIII', 'XIX', 'XX',
'XXI', 'XXII', 'XXIII', 'XXIV', 'XXV', 'XXVI', 'XXVII', 'XXVIII', 'XXIX',
'XXX']
def noop(data):
return data
......@@ -27,39 +22,9 @@ def numeric_to_zeropad(data):
return re.sub(r'chr(\d([^\d]|$))', r'chr0\1', data)
def roman_to_numeric(data):
def convert(match):
"""
Converts a single roman numeral to a number
"""
numeral = match.group(1)
numeral = numeral.upper()
if numeral not in ROMAN:
# Unable to convert detected Roman numeral
return match.group(0)
return 'chr'+str(ROMAN.index(numeral))+(match.group(2) or '')
r = re.compile('chr([IVX]+)([^IVX]|$)', flags=re.IGNORECASE)
data = r.sub(convert, data)
return data
def numeric_to_roman(data):
def convert(match):
"""
Converts a number to a roman numeral
"""
number = int(match.group(1))
if number >= len(ROMAN):
# Number is out of range to convert to a Roman numeral
return match.group(0)
return 'chr'+ROMAN[number]+(match.group(2) or '')
r = re.compile('chr(\d+)([^\d]|$)')
data = r.sub(convert, data)
return data
FORMATS = ['zeropad', 'numeric', 'roman']
IN_CONVERT = {'zeropad': zeropad_to_numeric, 'roman': roman_to_numeric, 'numeric': noop}
OUT_CONVERT = {'zeropad': numeric_to_zeropad, 'roman': numeric_to_roman, 'numeric': noop}
FORMATS = ['zeropad', 'numeric']
IN_CONVERT = {'zeropad': zeropad_to_numeric, 'numeric': noop}
OUT_CONVERT = {'zeropad': numeric_to_zeropad, 'numeric': noop}
def conversion_functions(in_fmt, out_fmt):
......@@ -69,19 +34,7 @@ def conversion_functions(in_fmt, out_fmt):
return [IN_CONVERT[in_fmt], OUT_CONVERT[out_fmt]]
def autodetect_format(data):
if re.search('chr0\d', data):
fmt = 'zeropad'
elif re.search('chr[IVXivx]', data):
fmt = 'roman'
else:
fmt = 'numeric'
return fmt
def convert_data(data, in_fmt, out_fmt):
if in_fmt == 'autodetect':
in_fmt = autodetect_format(data)
for fn in conversion_functions(in_fmt, out_fmt):
data = fn(data)
return data
......@@ -425,12 +378,12 @@ def process_chromosome(cname, data, writer, process_bounds, width, sigma, down_w
write(cname, '-', peak)
def sort_chromosome_reads_by_index( input_path ):
def sort_chromosome_reads_by_index(input_path):
"""
Return a gff file with chromosome reads sorted by index.
"""
# Will this sort produce different results across platforms?
output_path = tempfile.NamedTemporaryFile( delete=False ).name
output_path = tempfile.NamedTemporaryFile(delete=False).name
command = 'sort -k 1,1 -k 4,4n "%s" > "%s"' % (input_path, output_path)
p = subprocess.Popen(command, shell=True)
p.wait()
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment