Source code for CHiC.tool.hicup_tool

"""
.. See the NOTICE file distributed with this work for additional information
   regarding copyright ownership.
   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at
       http://www.apache.org/licenses/LICENSE-2.0
   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
"""

from __future__ import print_function

import os
import subprocess
import sys
from utils import logger
import shutil
import tarfile

try:
    if hasattr(sys, '_run_from_cmdl') is True:
        raise ImportError
    from pycompss.api.parameter import FILE_IN, FILE_OUT, IN
    from pycompss.api.task import task
    from pycompss.api.api import compss_wait_on
except ImportError:
    logger.warn("[Warning] Cannot import \"pycompss\" API packages.")
    logger.warn("          Using mock decorators.")

    from utils.dummy_pycompss import FILE_IN, FILE_OUT, IN  # pylint: disable=ungrouped-imports
    from utils.dummy_pycompss import task # pylint: disable=ungrouped-imports
    from utils.dummy_pycompss import compss_wait_on # pylint: disable=ungrouped-imports

from basic_modules.tool import Tool
from basic_modules.metadata import Metadata

from tool.aligner_utils import alignerUtils
from tool.common import common

##############################################################


[docs]class hicup(Tool): """ Tool to run hicup, from fastq to bam files """ def __init__(self, configuration=None): """ Initialising the function Parameters ---------- configuration: dict """ print("hicup initialising") Tool.__init__(self) if configuration is None: configuration = {} self.configuration.update(configuration)
[docs] def untar_index( # pylint: disable=too-many-locals,too-many-arguments self, genome_file_name, genome_idx, bt2_1_file, bt2_2_file, bt2_3_file, bt2_4_file, bt2_rev1_file, bt2_rev2_file): """ Extracts the Bowtie2 index files from the genome index tar file. Parameters ---------- genome_file_name : str Location string of the genome fasta file genome_idx : str Location of the Bowtie2 index file bt2_1_file : str Location of the <genome>.1.bt2 index file bt2_2_file : str Location of the <genome>.2.bt2 index file bt2_3_file : str Location of the <genome>.3.bt2 index file bt2_4_file : str Location of the <genome>.4.bt2 index file bt2_rev1_file : str Location of the <genome>.rev.1.bt2 index file bt2_rev2_file : str Location of the <genome>.rev.2.bt2 index file Returns ------- bool Boolean indicating if the task was successful """ if "no-untar" in self.configuration and self.configuration["no-untar"] is True: return True gfl = genome_file_name.split("/") au_handle = alignerUtils() au_handle.bowtie2_untar_index( gfl[-1], genome_idx, bt2_1_file, bt2_2_file, bt2_3_file, bt2_4_file, bt2_rev1_file, bt2_rev2_file) return True
[docs] @staticmethod def get_hicup_params(params): """ Function to handle to extraction of commandline parameters and formatting them for use with hicup Parameters ---------- params : dict --bowtie Specify the path to Bowtie --bowtie2 Specify the path to Bowtie 2 --config Specify the configuration file --digest Specify the digest file listing restriction fragment co-ordinates --example Produce an example configuration file --format Specify FASTQ format Options: Sanger, Solexa_Illumina_1.0, Illumina_1.3, Illumina_1.5 --help Print help message and exit --index Path to the relevant reference genome Bowtie/Bowtie2 indices --keep Keep intermediate pipeline files --longest Maximum allowable insert size (bps) --nofill Hi-C protocol did NOT include a fill-in of sticky ends prior to ligation step and therefore FASTQ reads shall be truncated at the Hi-C restriction enzyme cut site (if present) sequence is encountered --outdir Directory to write output files --quiet Suppress progress reports (except warnings) --shortest Minimum allowable insert size (bps) --temp Write intermediate files (i.e. all except summaryfiles and files generated by HiCUP Deduplicator) to a specified directory --threads Specify the number of threads, allowing simultaneous processing of multiple files --version Print the program version and exit --zip Compress output Returns ------- list """ command_params = [] command_parameters = { "hicup_bowtie2_loc": ["--bowtie2", True], "hicup_bowtie_loc": ["--bowtie", True], "hicup_genome_digest" : ["--digest", True], "hicup_index" : ["--index", True], "hicup_longest": ["--longest", True], "hicup_shortest": ["--shortest", True], "hicup_outdir": ["--outdir", True], "hicup_format": ["--format", False], "hicup_keep": ["--keep", False], "hicup_nofill": ["--nofill", False], "hicup_quite": ["--quiet", False], "hicup_temp": ["--temp", False], "hicup_version": ["--version", False], "hicup_zip": ["--zip", False], "hicup_threads": ["--threads", True] } for param in params: if param in command_parameters and params[param] != "None": if command_parameters[param][1]: if command_parameters[param][0] == "--outdir": continue if command_parameters[param][0] == "--bowtie2": continue command_params += [command_parameters[param][0], params[param]] else: if command_parameters[param][0]: command_params += [command_parameters[param][0]] return command_params
#@task(returns=str, # genome_name=IN, # re_enzyme=IN, # genome_loc=FILE_IN, # re_enzyme2=IN)
[docs] def digest_genome(self, genome_name, re_enzyme, genome_loc, re_enzyme2): """ This function takes a genome and digest it using a restriction enzyme specified Parameters ---------- genome_name: str name of the output genome re_enzyme: str name of the enzyme used to cut the genome format example A^GATCT,BglII . genome_loc: str location of the genome in FASTA format re_enzyme2: str Restriction site 2 refers to the second, optional (other DNA shearing techniques such as sonication may be used) enzymatic digestion. This restriction site does NOT form a Hi-C ligation junction. This is the restriction enzyme that is used when the Hi-C sonication protocol is not followed. Typically the sonication protocol is followed. """ if re_enzyme2 == "enzyme2": args = ["hicup_digester", "--genome", genome_name, "--re1", re_enzyme, genome_loc] else: args = ["hicup_digester", "--genome", genome_name, "--re1", re_enzyme, "--re2", re_enzyme2, genome_loc] try: logger.info("hicup_digester CMD: " + " ".join(args)) process = subprocess.Popen( ' '.join(args), shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE ) process.wait() except (IOError, OSError) as msg: logger.fatal("I/O error({0}): {1}\n{2}".format( msg.errno, msg.strerror, args)) files_dir = os.listdir(".") digest_genome = [file_ for file_ in files_dir if \ file_.startswith("Digest_"+genome_name)] return "".join(digest_genome)
[docs] def hicup_alig_filt(self, params, genome_digest, genome_index, genome_loc, fastq1, fastq2, outdir_tar): """ This function aling the HiC read into a reference genome and filter them Parameters ---------- bowtie2_loc: genome_index: str location of genome indexed with bowtie2 digest_genome: str location of genome digested fastq1: str location of fastq2 file fastq2: str location of fastq2 Returns ------- Bool """ folder = os.path.split(outdir_tar)[0]+"/"+ \ os.path.split(outdir_tar)[1].split(".")[0] if os.path.isdir(folder) is False: os.mkdir(folder) index_files = { "1.bt2": genome_loc + ".1.bt2", "2.bt2": genome_loc + ".2.bt2", "3.bt2": genome_loc + ".3.bt2", "4.bt2": genome_loc + ".4.bt2", "rev.1.bt2": genome_loc + ".rev.1.bt2", "rev.2.bt2": genome_loc + ".rev.2.bt2" } logger.progress("Untar Index: "+genome_loc+", "+genome_index) self.untar_index( genome_loc, genome_index, index_files["1.bt2"], index_files["2.bt2"], index_files["3.bt2"], index_files["4.bt2"], index_files["rev.1.bt2"], index_files["rev.2.bt2"] ) hicup_args = [ "hicup", "--index", genome_loc, "--digest", genome_digest, fastq1, fastq2 ] hicup_args = hicup_args + params + ["--bowtie2", "/home/compss/bin/bowtie2", "--outdir", folder] logger.info("arguments for hicup:" + " ".join(hicup_args)) try: process = subprocess.Popen(" ".join(hicup_args), shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) process.wait() logger.info("TARING output folder") tar_file = outdir_tar archive_name = os.path.split(outdir_tar)[1].split(".")[0] onlyfiles = [f for f in os.listdir(folder) if os.path.isfile(os.path.join(folder, f))] tar = tarfile.open(tar_file, "w") for tmp_file in onlyfiles: tar.add( os.path.join(folder, tmp_file), arcname=os.path.join(archive_name, tmp_file) ) os.remove(os.path.join(folder, tmp_file)) tar.close() shutil.rmtree(folder) shutil.move(tar_file, outdir_tar) for indexed_file in index_files: os.remove(index_files[indexed_file]) return True except IOError: return False
[docs] @task(returns=bool, params=IN, genome_digest=FILE_IN, genome_index=FILE_IN, genome_loc=FILE_IN, fastq1=FILE_IN, fastq2=FILE_IN, outdir_tar=FILE_OUT) def hicup_alig_filt_runner(self, params, genome_digest, genome_index, genome_loc, fastq1, fastq2, outdir_tar): """ This function runs the hicup_alig_filt Parameters ---------- bowtie2_loc: genome_index: str location of genome indexed with bowtie2 digest_genome: str location of genome digested fastq1: str location of fastq2 file fastq2: str location of fastq2 Returns ------- Bool """ self.hicup_alig_filt(params, genome_digest, genome_index, genome_loc, fastq1, fastq2, outdir_tar) return True
[docs] def run(self, input_files, metadata, output_files): """ Function that runs and pass the parameters for all the functions Parameters ---------- input_files: dict metadata: dict output_files: dict """ output_dir = os.path.split(output_files["hicup_outdir_tar"])[0] if os.path.isdir(output_dir) is False: os.mkdir(output_dir) if isinstance(self.configuration["hicup_renzyme"], list) is True: re_enzyme = ":".join(self.configuration["hicup_renzyme"]) else: re_enzyme = self.configuration["hicup_renzyme"] if "renzyme_name2" in self.configuration: genome_d = self.digest_genome( self.configuration["genome_name"], re_enzyme, input_files["genome_fa"], self.configuration["renzyme_name2"] ) else: genome_d = self.digest_genome( self.configuration["genome_name"], re_enzyme, input_files["genome_fa"], "enzyme2" ) parameters_hicup = self.get_hicup_params(self.configuration) #if os.path.isdir(self.configuration["hicup_outdir"]) is False: # os.mkdir(self.configuration["hicup_outdir"]) variable = self.hicup_alig_filt(# pylint: disable=too-many-locals,too-many-arguments parameters_hicup, genome_d, input_files["bowtie_gen_idx"], input_files["genome_fa"], input_files["fastq1"], input_files["fastq2"], output_files["hicup_outdir_tar"]) os.remove(genome_d) #variable = compss_wait_on(variable) output_metadata = { "hicup_outdir_tar" : Metadata( data_type="data_CHiC", file_type="TAR", file_path=output_files["hicup_outdir_tar"], sources=[ metadata["genome_fa"].file_path, metadata["fastq1"].file_path, metadata["fastq1"].file_path ], taxon_id=metadata["genome_fa"].taxon_id, meta_data={"tool": "hicup_tool"} ) } return output_files, output_metadata