Commit b5197d24 authored by Laure QUINTRIC's avatar Laure QUINTRIC
Browse files

add trimreads options to allow or not removing primers from reads

parent e2210df6
#!/usr/bin/env bash
#PBS -S /usr/bin/sh
#PBS -q sequentiel
#PBS -l ncpus=1
#PBS -q omp
#PBS -l ncpus=8
#PBS -l mem=20g
#PBS -l walltime=1:00:00
#PBS -m n
......@@ -9,6 +9,6 @@
#load python 3.6
. /appli/bioinfo/python/3.6/env.sh
#run extract script on config file
echo "python $directory/extractR1R2.py --trimreads $trimreads --simply-rename-files $rename --input-R1 $indir/$R1 --input-R2 $indir/$R2 --output-dir $output --barcode $barcode --properties $properties --abyss-sample-name $samplename --tmpdir $tmpdir"
python $directory/extractR1R2.py --trimreads $trimreads --simply-rename-files $rename --input-R1 $indir/$R1 --input-R2 $indir/$R2 --output-dir $output --barcode $barcode --properties $properties --abyss-sample-name $samplename --tmpdir $tmpdir
python $directory/extractR1R2.py --trimreads $trimreads --input-R1 $indir/$R1 --input-R2 $indir/$R2 --output-dir $output --barcode $barcode --properties $properties --tmpdir $tmpdir
cd abyss/abyss-preprocessing
. /appli/bioinfo/python/3.6/delenv.sh
......@@ -28,12 +28,10 @@ def handleArgs():
help="Barcode marker", required=True)
parser.add_argument("--properties", dest="properties",
help="Path to properties file", metavar="FILE", required=True)
parser.add_argument("--abyss-sample-name", dest="abysssamplename",
help="Path to csv file with genoscope and abyss sample name", metavar="FILE", required=True)
parser.add_argument("--tmpdir", dest="tmpdir",
help="Path to temp directory", metavar="FILE", required=True)
parser.add_argument("--trimreads", dest="trimreads", help="Trim reads in cutadapt step", metavar="True/False", required=True)
parser.add_argument("--simply-rename-files", dest="rename", help="No cutadapt, no bbmap, just rename input files with corresponding abyss sample name", metavar="True/False", required=True)
parser.add_argument("--trimreads", dest="trimreads",
help="Trim reads in cutadapt step", metavar="True/False", required=True)
# Arguments initialization
args = parser.parse_args()
......@@ -43,9 +41,7 @@ def handleArgs():
properties = args.properties
barcode = args.barcode
tmpdir = args.tmpdir
abysssamplename = args.abysssamplename
trimreads = eval(args.trimreads)
rename = eval(args.rename)
trimreads = args.trimreads
if not os.path.isfile(inR1):
parser.error("Input file {} does not exist".format(inR1))
......@@ -59,18 +55,16 @@ def handleArgs():
if not os.path.isfile(properties):
parser.error("Properties file does not exit")
sys.exit(1)
if not os.path.isfile(abysssamplename):
parser.error(
"Abyss/Genoscope sample name correspondence file does not exit")
sys.exit(1)
if barcode is None:
parser.error("Barcode marker missing")
sys.exit(1)
if not os.path.isdir(tmpdir):
logging.debug("Tmpdir directory does not exist, going to create it")
os.system("mkdir -p {}".format(tmpdir))
if trimreads:
logging.debug("Going to trimreads at cutadapt step")
return inR1, inR2, outdir, barcode, properties, abysssamplename, tmpdir, trimreads, rename
return inR1, inR2, outdir, barcode, properties, tmpdir, trimreads
# get primers in properties file depending on barcode
......@@ -96,154 +90,70 @@ def getPrimers(barcode, properties):
# Use cutadapt in order to identify forward and reverse reads for each sample
# Then extract R1 and R2 to separate file for each sample
def extractReads(inR1, inR2, outdir, forward, reverse, mismatch, tmpdir, trimreads, rename):
# list all files of the input directory
#put together couples
couples={}
inputs=[inR1, inR2]
for rf in inputs :
# NOTE : working only with files named like CCR_AAEXOSTA_2_1_HGWCMBCX2.12BA308_clean.fastq.gz
#sample=rf.split("_clean")[0]
sample=rf.split(".fastq.gz")[0]
if "_clean" in str(sample) :
sample=sample.replace("_clean","")
logging.debug(
"----------------------------------------------------------------")
logging.debug("FILE {}".format(sample))
# Processing forward reads in file 1_1 2_1
if '_2_1_' in str(rf):
readsname="R1"
genoscopesamplename=sample.replace("_2_1_","_2_")
abyssname = renameSampleFiles(abysssamplename, genoscopesamplename)
elif '_1_1_' in str(rf):
readsname="R1"
genoscopesamplename=sample.replace("_1_1_","_1_")
abyssname = renameSampleFiles(abysssamplename, genoscopesamplename)
elif '_4_1_' in str(rf):
readsname="R1"
genoscopesamplename=sample.replace("_4_1_","_4_")
abyssname = renameSampleFiles(abysssamplename, genoscopesamplename)
elif '_5_1_' in str(rf):
readsname="R1"
genoscopesamplename=sample.replace("_5_1_","_5_")
abyssname = renameSampleFiles(abysssamplename, genoscopesamplename)
# Processing reverse reads in file 1_2 2_2
elif '_1_2_' in str(rf):
readsname="R2"
genoscopesamplename=sample.replace("_1_2_","_1_")
abyssname = renameSampleFiles(abysssamplename, genoscopesamplename)
elif '_4_2_' in str(rf):
readsname="R2"
genoscopesamplename=sample.replace("_4_2_","_4_")
abyssname = renameSampleFiles(abysssamplename, genoscopesamplename)
elif '_5_2_' in str(rf):
readsname="R2"
genoscopesamplename=sample.replace("_5_2_","_5_")
abyssname = renameSampleFiles(abysssamplename, genoscopesamplename)
else:
readsname="R2"
genoscopesamplename=sample.replace("_2_2_","_2_")
abyssname = renameSampleFiles(abysssamplename, genoscopesamplename)
if abyssname is None :
logging.debug("No corresponding name find for sample {}".format(sample))
sys.exit(1)
# Rename = copy original files with corresponding abyss sample name
if rename :
os.system("cp {} {}/{}_{}.fastq.gz".format(rf, outdir,os.path.basename(abyssname),readsname))
# if simply rename all files with their abyss sample name, stop here, else process all cutadapt, bbmap...
else :
# putting together couples of files in order to re-pair the reads with bbmap
if abyssname not in couples :
couples[abyssname] = {}
logging.debug("List of couples of files going to be proceed : \n {}".format(couples))
#looking for primers for separating the reads
primers = ["F", "R"]
for primername in primers :
if "F" in primername :
primer=forward
else :
primer=reverse
usedname = "{}_{}{}".format(abyssname, readsname, primername)
logging.debug("ABYSS sample name is {}".format(usedname))
# calculate error rate for primer
erate=round((int(mismatch) / int(len(primer[0])) + 0.01), 2)
minoverlap=int(len(primer[0])) - 1
# Run cutadapt
cutadaptoutname="{}/{}-cutadapt.fastq.gz".format(
outdir, usedname)
if trimreads :
action = "trim"
else :
action = "none"
runCutadapt(trimreads, primer, erate, minoverlap, cutadaptoutname, rf, tmpdir, usedname)
# Rename forward reads with /1 and (as some /2 reads have been detected as forward by cutadapt)
if "F" in primername :
#we are supposed to have only /1 reads in this file, so replace /2 by /1
cmd = "gunzip < {} | sed -e '/@[A-Z][0-9]:.*\/2$/s/\/2/\/1/g' | gzip -c > {}.tempo.gz ; mv {}.tempo.gz {}".format(
cutadaptoutname, cutadaptoutname, cutadaptoutname, cutadaptoutname)
else :
#cmd = "gunzip < {} | sed -re 's#^(@[[:alpha:]][[:digit:]]:[[:digit:]]:[[:alnum:]]+:[[:digit:]]:[[:digit:]]+:[[:digit:]]+:[[:digit:]]+.*)/1$#\1/2#g' | gzip -c > {}.tempo.gz ; mv {}.tempo.gz {}".format(
cmd = "gunzip < {} | sed -e '/@[A-Z][0-9]:.*\/1$/s/\/1/\/2/g' | gzip -c > {}.tempo.gz ; mv {}.tempo.gz {}".format(
cutadaptoutname, cutadaptoutname, cutadaptoutname, cutadaptoutname)
out = runcmd(cmd)
couples[abyssname]["{}{}".format(readsname,primername)]=cutadaptoutname
# Run bbmap only if not in running in rename mode
if not rename :
for samp, info in couples.items():
# re pair reads with bbmap
singletons="{}/{}_R1F-R2Rsingletons.fastq.gz".format(outdir, samp)
R1repaired=info["R1F"].replace("-cutadapt", "")
R2repaired=info["R2R"].replace("-cutadapt", "")
rePairReads(outdir, R1repaired, R2repaired , singletons, info["R1F"], info["R2R"])
singletons="{}/{}_R2F-R1Rsingletons.fastq.gz".format(outdir, samp)
R1repaired=info["R2F"].replace("-cutadapt", "")
R2repaired=info["R1R"].replace("-cutadapt", "")
rePairReads(outdir, R1repaired, R2repaired , singletons, info["R2F"], info["R1R"])
def runCutadapt(trimreads, primerseq, erate, minoverlap, outfile, infile, tmpdir, outfilename) :
def extractReads(inR1, inR2, outdir, forward, reverse, mismatch, tmpdir, trimreads):
R1F = "R1F"
R1R = "R1R"
R2R = "R2R"
R2F = "R2F"
couplename="{}".format(inR1.split("/")[-1].split(".fastq.gz")[0])
logging.debug("Couple name : {}".format(couplename))
samplename="{}_{}".format(couplename, R1F)
R1cutadaptoutname="{}/{}-cutadapt.fastq.gz".format(outdir, samplename)
runCutadapt(forward, mismatch, R1cutadaptoutname, samplename, inR1, tmpdir, trimreads)
cmd = "gunzip < {} | sed -e '/@[A-Z][0-9]:.*\/2$/s/\/2/\/1/g' | gzip -c > {}.tempo.gz ; mv {}.tempo.gz {}".format(R1cutadaptoutname, R1cutadaptoutname, R1cutadaptoutname, R1cutadaptoutname)
out = runcmd(cmd)
samplename="{}_{}".format(couplename, R2R)
R2cutadaptoutname="{}/{}-cutadapt.fastq.gz".format(outdir, samplename)
runCutadapt(reverse, mismatch, R2cutadaptoutname, samplename, inR2, tmpdir, trimreads)
cmd = "gunzip < {} | sed -e '/@[A-Z][0-9]:.*\/1$/s/\/1/\/2/g' | gzip -c > {}.tempo.gz ; mv {}.tempo.gz {}".format(R2cutadaptoutname, R2cutadaptoutname, R2cutadaptoutname, R2cutadaptoutname)
out = runcmd(cmd)
singletons="{}/{}_R1F-R2Rsingletons.fastq.gz".format(outdir, couplename)
R1repaired= R1cutadaptoutname.replace("-cutadapt", "")
R2repaired= R2cutadaptoutname.replace("-cutadapt", "")
rePairReads(outdir, R1repaired, R2repaired , singletons, R1cutadaptoutname, R2cutadaptoutname)
samplename="{}_{}".format(couplename, R1R)
R1cutadaptoutname="{}/{}-cutadapt.fastq.gz".format(outdir,samplename)
runCutadapt(reverse, mismatch, R1cutadaptoutname, samplename, inR1, tmpdir, trimreads)
cmd = "gunzip < {} | sed -e '/@[A-Z][0-9]:.*\/2$/s/\/2/\/1/g' | gzip -c > {}.tempo.gz ; mv {}.tempo.gz {}".format(R1cutadaptoutname, R1cutadaptoutname, R1cutadaptoutname, R1cutadaptoutname)
out = runcmd(cmd)
samplename="{}_{}".format(couplename, R2F)
R2cutadaptoutname="{}/{}-cutadapt.fastq.gz".format(outdir,samplename)
runCutadapt(forward, mismatch, R2cutadaptoutname, samplename, inR2, tmpdir, trimreads)
cmd = "gunzip < {} | sed -e '/@[A-Z][0-9]:.*\/1$/s/\/1/\/2/g' | gzip -c > {}.tempo.gz ; mv {}.tempo.gz {}".format(R2cutadaptoutname, R2cutadaptoutname, R2cutadaptoutname, R2cutadaptoutname)
out = runcmd(cmd)
singletons="{}/{}_R1R-R2Fsingletons.fastq.gz".format(outdir, couplename)
R1repaired= R1cutadaptoutname.replace("-cutadapt", "")
R2repaired= R2cutadaptoutname.replace("-cutadapt", "")
rePairReads(outdir, R1repaired, R2repaired , singletons, R1cutadaptoutname, R2cutadaptoutname)
def runCutadapt(primerseq, mismatch, outfile, samplename, infile, tmpdir, trimreads) :
logging.debug("Going to run cutadapt on file {}".format(infile))
if trimreads :
action = "trim"
else :
action = "none"
param = ""
erate = ""
minoverlap = ""
for item in primerseq :
param += "-g {} ".format(str(item))
erate=round((int(mismatch) / int(len(item)) + 0.01), 2)
minoverlap=int(len(item)) - 1
if param is not None :
cmd="bash; . /appli/bioinfo/cutadapt/1.18/env.sh ; cutadapt --action={} -j $NCPUS {} -e {} -O {} --minimum-length=1 --max-n=0 --discard-untrimmed -o {} {} > {}/cutadapt_{}.log; . /appli/bioinfo/cutadapt/1.18/delenv.sh".format(action, param, erate, minoverlap, outfile, infile, tmpdir, outfilename)
cmd=". /appli/bioinfo/cutadapt/1.18/env.sh ; cutadapt --action={} -j $NCPUS {} -e {} -O {} --minimum-length=1 --max-n=0 --discard-untrimmed -o {} {} &> {}/cutadapt_{}.log;".format(action, param, erate, minoverlap, outfile, infile, tmpdir, samplename)
out=runcmd(cmd)
else :
logging.debug("CUTADAPT ERROR while treating file {}, exiting programme".format(infile))
sys.exit(1)
def renameSampleFiles(abysssamplename, genoscopesamplename):
filelist=[]
logging.debug("ABYSS SAMPLE NAME = {}, GENOSCOPE SAMPLE NAME = {}".format(abysssamplename,genoscopesamplename))
with open(abysssamplename, newline='') as csvfile:
csvreader=csv.reader(csvfile, delimiter=';')
for row in csvreader:
#if genoscope file name in first column match the treated sample
#return the abyss name
if os.path.basename(genoscopesamplename) in row[0]:
return row[1]
def rePairReads(outdir, R1repaired, R2repaired, singletons, R1file, R2file):
# repair reads files generated with cutadapt in order have the same number of
# reads in R1 and R2 files.
......@@ -251,7 +161,7 @@ def rePairReads(outdir, R1repaired, R2repaired, singletons, R1file, R2file):
# Using bbmap/repair script
# BBMAP cmd
cmd="bash; . /appli/bioinfo/bbmap/38.22/env.sh ; repair.sh overwrite=t in1={} in2={} out1={} out2={} outs={} repair ; . /appli/bioinfo/bbmap/38.22/delenv.sh".format(
cmd=". /appli/bioinfo/bbmap/38.22/env.sh ; repair.sh overwrite=t in1={} in2={} out1={} out2={} outs={} repair;".format(
R1file, R2file, R1repaired, R2repaired, singletons)
out=runcmd(cmd)
......@@ -263,7 +173,6 @@ def rePairReads(outdir, R1repaired, R2repaired, singletons, R1file, R2file):
logging.debug("Removing /2 from file {} : {}".format(R1repaired, cmd))
out=runcmd(cmd)
def runcmd(cmd):
logging.debug("Running process....")
p=subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True)
......@@ -284,46 +193,32 @@ def runcmd(cmd):
if __name__ == '__main__':
# Handle script arguments
inR1, inR2, outdir, barcode, properties, abysssamplename, tmpdir, trimreads, rename =handleArgs()
inR1, inR2, outdir, barcode, properties, tmpdir, trimreads=handleArgs()
# logging config
st = datetime.datetime.now().strftime('%Y-%m-%d_%H:%M:%S')
for handler in logging.root.handlers[:]:
logging.root.removeHandler(handler)
logfile='{}/extractR1R2_{}_{}.log'.format(tmpdir,inR1.split(".fastq.gz")[0].split("/")[-1:][0], st)
logging.basicConfig(level=logging.DEBUG, filename=logfile, format='%(asctime)s %(levelname)s %(message)s', filemode='w')
if trimreads:
logging.debug("Going to trimreads at cutadapt step")
if rename:
logging.debug("Going to simply rename files with their corresponding names")
logging.debug(
"Extract forward and reverse reads from Genoscope raw sequencing data")
"Extract forward and reverse reads from paired-end sequencing data")
logging.debug("input R1 = {}".format(inR1))
logging.debug("input R2 = {}".format(inR2))
logging.debug("output directory = {}".format(outdir))
logging.debug("tmp and log directory = {}".format(tmpdir))
logging.debug("barcode = {}".format(barcode))
logging.debug("properties = {}".format(properties))
logging.debug("sample name csv file = {}".format(abysssamplename))
#We don't need to get primers if we just want to rename input files
if not rename :
# get forward and reverse primers
forward, reverse, mismatch = getPrimers(barcode, properties)
logging.debug("Primer forward {} = {}".format(barcode, forward))
logging.debug("Primer reverse {} = {}".format(barcode, reverse))
else :
forward = None
reverse = None
mismatch = None
# get forward and reverse primers
forward, reverse, mismatch = getPrimers(barcode, properties)
logging.debug("Primer forward {} = {}".format(barcode, forward))
logging.debug("Primer reverse {} = {}".format(barcode, reverse))
# Extract Forward and Reverse Reads according to Primers (using cutadapt)
extractReads(inR1, inR2, outdir, forward, reverse, mismatch, tmpdir, trimreads, rename)
extractReads(inR1, inR2, outdir, forward, reverse, mismatch, tmpdir, trimreads)
# We cannot create frogs archive if samples have not been parsed
if not rename :
#create frogs archive
os.system("mkdir -p {}/frogs; cd {}; tar cvf frogs/samples.tar *_{{R1F,R2R,R2F,R1R}}.fastq.gz".format(outdir,outdir))
#create frogs archive
os.system("mkdir -p {}/frogs; cd {}; tar cvf frogs/samples.tar *_{{R1F,R2R,R2F,R1R}}.fastq.gz".format(outdir,outdir))
logging.debug("End of processing")
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment