Skip to content
Snippets Groups Projects
Commit c201391e authored by Julien Cornut's avatar Julien Cornut
Browse files

Automatic Update

parent 4612494e
No related branches found
No related tags found
No related merge requests found
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# This lib is needed to parse fastq easily # This lib is needed to parse fastq easily
from Bio import SeqIO from Bio import SeqIO
# This lib is used to avoid using a bash magic cell to copy a file # This lib is used to avoid using a bash magic cell to copy a file
from shutil import copyfile from shutil import copyfile
# Used to... list directory # Used to... list directory
from os import listdir, path from os import listdir, path
# Check the time taken by a function to run # Check the time taken by a function to run
import datetime import datetime
# Enter/Uncomment here the name of the file you want to process # Enter/Uncomment here the name of the file you want to process
# fname = "Undetermined_lane7_pair1" # fname = "Undetermined_lane7_pair1"
# fname = "flowcell261_lane8_pair1_CAGATC" # fname = "flowcell261_lane8_pair1_CAGATC"
# fname = "flowcell261_lane8_pair1_TGACCA" # fname = "flowcell261_lane8_pair1_TGACCA"
# fname = "flowcell362_lane4_pair1_ACAGTG" # fname = "flowcell362_lane4_pair1_ACAGTG"
# fname = "flowcell362_lane4_pair1_ACTTGA" # fname = "flowcell362_lane4_pair1_ACTTGA"
# fname = "flowcell362_lane4_pair1_CAGATC" # fname = "flowcell362_lane4_pair1_CAGATC"
# fname = "flowcell362_lane4_pair1_TGACCA" # fname = "flowcell362_lane4_pair1_TGACCA"
# fname = "flowcell362_lane4_pair1_Undetermined" # fname = "flowcell362_lane4_pair1_Undetermined"
# fname = "flowcell384_lane7_pair1_ACAGTG" # fname = "flowcell384_lane7_pair1_ACAGTG"
# fname = "flowcell384_lane7_pair1_ACTTGA" # fname = "flowcell384_lane7_pair1_ACTTGA"
# fname = "flowcell384_lane7_pair1_CAGATC" # fname = "flowcell384_lane7_pair1_CAGATC"
# fname = "flowcell384_lane7_pair1_GATCAG" # fname = "flowcell384_lane7_pair1_GATCAG"
# fname = "flowcell384_lane7_pair1_TGACCA" # fname = "flowcell384_lane7_pair1_TGACCA"
fname = "testing"

# Print available files in 0-Raws/ directory
print("\nAvailable files :\n")
for entry in sorted(listdir("0-Raws/")):
    # Only the part of the file name before the first dot is shown.
    base_name = entry.partition('.')[0]
    print(" - " + base_name)
``` ```
%% Output %% Output
Available files : Available files :
- Undetermined_lane7_pair1 - Undetermined_lane7_pair1
- flowcell261_lane8_pair1_CAGATC - flowcell261_lane8_pair1_CAGATC
- flowcell261_lane8_pair1_TGACCA - flowcell261_lane8_pair1_TGACCA
- flowcell362_lane4_pair1_ACAGTG - flowcell362_lane4_pair1_ACAGTG
- flowcell362_lane4_pair1_ACTTGA - flowcell362_lane4_pair1_ACTTGA
- flowcell362_lane4_pair1_CAGATC - flowcell362_lane4_pair1_CAGATC
- flowcell362_lane4_pair1_TGACCA - flowcell362_lane4_pair1_TGACCA
- flowcell362_lane4_pair1_Undetermined - flowcell362_lane4_pair1_Undetermined
- flowcell384_lane7_pair1_ACAGTG - flowcell384_lane7_pair1_ACAGTG
- flowcell384_lane7_pair1_ACTTGA - flowcell384_lane7_pair1_ACTTGA
- flowcell384_lane7_pair1_CAGATC - flowcell384_lane7_pair1_CAGATC
- flowcell384_lane7_pair1_GATCAG - flowcell384_lane7_pair1_GATCAG
- flowcell384_lane7_pair1_TGACCA - flowcell384_lane7_pair1_TGACCA
- testing - testing
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
from pprint import pprint

# Base names (text before the first dot) found in the raw-input directory
# and in the WIG output directory.
fn = {entry.split('.')[0] for entry in listdir("0-Raws/")}
wn = {entry.split('.')[0] for entry in listdir("7-WIGs/")}

# One display line per raw file, in sorted order; files that also have a WIG
# get a marker appended.
lst = (
    name + " \t\t Processsed" if name in wn else name
    for name in sorted(fn)
)

# Header: how many base names appear in both directories.
print("\nProcessed : {0}\n".format(len(fn.intersection(wn))))
for line in lst:
    print(line)
``` ```
%% Output %% Output
Processed : 5
Undetermined_lane7_pair1 Undetermined_lane7_pair1
flowcell261_lane8_pair1_CAGATC Processsed flowcell261_lane8_pair1_CAGATC Processsed
flowcell261_lane8_pair1_TGACCA Processsed flowcell261_lane8_pair1_TGACCA Processsed
flowcell362_lane4_pair1_ACAGTG Processsed flowcell362_lane4_pair1_ACAGTG Processsed
flowcell362_lane4_pair1_ACTTGA Processsed flowcell362_lane4_pair1_ACTTGA Processsed
flowcell362_lane4_pair1_CAGATC Processsed flowcell362_lane4_pair1_CAGATC Processsed
flowcell362_lane4_pair1_TGACCA flowcell362_lane4_pair1_TGACCA
flowcell362_lane4_pair1_Undetermined flowcell362_lane4_pair1_Undetermined
flowcell384_lane7_pair1_ACAGTG flowcell384_lane7_pair1_ACAGTG
flowcell384_lane7_pair1_ACTTGA flowcell384_lane7_pair1_ACTTGA
flowcell384_lane7_pair1_CAGATC flowcell384_lane7_pair1_CAGATC
flowcell384_lane7_pair1_GATCAG flowcell384_lane7_pair1_GATCAG
flowcell384_lane7_pair1_TGACCA flowcell384_lane7_pair1_TGACCA
testing testing
[None,
None,
None,
None,
None,
None,
None,
None,
None,
None,
None,
None,
None,
None]
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
%%bash
# Print the line count of one sample's file at three pipeline stages.
# The sample name is set once and reused in every path, so switching sample
# only requires editing this one assignment.
export name="testing"
echo "$name"
wc -l "3-Filtered/${name}.fastq"
wc -l "4-Bowtied/${name}.sam"
wc -l "5-ncRNA-Removed/${name}.fastq"
``` ```
%% Output %% Output
testing testing
4000 3-Filtered/testing.fastq 4000 3-Filtered/testing.fastq
1415 4-Bowtied/testing.sam 1415 4-Bowtied/testing.sam
1780 5-ncRNA-Removed/testing.fastq 1780 5-ncRNA-Removed/testing.fastq
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
from collections import Counter
from time import sleep

# Number of leading lines of the SAM file to skip before the alignment
# records start.
lines_to_burn = 415

# Sanity check: count the FASTQ reads and the SAM records, and how many
# (read, record) pairs share the same ID.
# NOTE(review): the output file is opened in "w" mode, which truncates it,
# although this cell never writes to it -- confirm this is intended.
with open("3-Filtered/" + fname + ".fastq", "r") as filtered, \
     open("4-Bowtied/" + fname + ".sam", "r") as matches, \
     open("5-ncRNA-Removed/" + fname + ".fastq", "w") as substracted:
    # All FASTQ records, and all SAM lines after the skipped leading lines.
    filt_iter = list(SeqIO.parse(filtered, "fastq"))
    matches_iter = list(matches)[lines_to_burn:]

    # Split each SAM line once: column 0 is the read ID, column 1 the flag.
    sam_fields = [line.split('\t') for line in matches_iter]
    sam_id_counts = Counter(fields[0] for fields in sam_fields)
    # Records whose flag column is exactly '4' (no reported alignment).
    unaligned_id_counts = Counter(
        fields[0] for fields in sam_fields
        if len(fields) > 1 and fields[1] == '4'
    )
    fastq_id_counts = Counter(rec.id for rec in filt_iter)

    # i  : number of FASTQ reads
    # j  : number of SAM records
    # im : (read, record) pairs with equal IDs, counted from the FASTQ side
    # jm : the same pairs, counted from the SAM side
    # ma : pairs with equal IDs whose SAM record has flag '4'
    # Counter lookups replace the original nested loops (same counts, O(n)).
    i = len(filt_iter)
    j = len(matches_iter)
    im = sum(sam_id_counts[rec.id] for rec in filt_iter)
    ma = sum(unaligned_id_counts[rec.id] for rec in filt_iter)
    jm = sum(fastq_id_counts[fields[0]] for fields in sam_fields)

    print(i)
    print(i, im, j, jm)
    # This cell only counts; nothing is written to `substracted`.
``` ```
%% Output %% Output
1000 1000
1000 1000 1000 1000 1000 1000 1000 1000
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# Write to 5-ncRNA-Removed/ only the reads for which the SAM file reports no
# alignment (flag column equal to '4').
with open("3-Filtered/" + "testing" + ".fastq", "r") as filtered, \
     open("4-Bowtied/" + "testing" + ".sam", "r") as matches, \
     open("5-ncRNA-Removed/" + "testing" + ".fastq", "w") as substracted:
    # Single pass over the SAM file: header lines start with '@'; every other
    # line is an alignment record (column 0 = read ID, column 1 = flag).
    header_lines = []
    unaligned_ids = set()
    for line in matches:
        if line.startswith('@'):
            header_lines.append(line)
            continue
        fields = line.split('\t')
        # Guard against blank or truncated lines instead of raising IndexError.
        if len(fields) > 1 and fields[1] == '4':
            unaligned_ids.add(fields[0])

    # Check last lines of the header
    for line in header_lines[-3:]:
        print(line)

    # Iterator over fastq file
    filt_iter = SeqIO.parse(filtered, "fastq")
    # Reads are matched to SAM records by ID rather than by line position, so
    # the result does not depend on the SAM records being in input order or
    # on a fixed header length.
    sub_iter = (rec for rec in filt_iter if rec.id in unaligned_ids)
    # Write back fastq
    SeqIO.write(sub_iter, substracted, "fastq")
``` ```
%% Output %% Output
@SQ SN:tW(UCA)Q LN:74 @SQ SN:tW(UCA)Q LN:74
@SQ SN:tY(GUA)Q LN:84 @SQ SN:tY(GUA)Q LN:84
@PG ID:Bowtie VN:1.1.2 CL:"bowtie --wrapper basic-0 -S -v 3 -p 4 --best ref/2-Indexes/Yeast-Noncoding/Yeast-Noncoding 3-Filtered/testing.fastq 4-Bowtied/testing.sam" @PG ID:Bowtie VN:1.1.2 CL:"bowtie --wrapper basic-0 -S -v 3 -p 4 --best ref/2-Indexes/Yeast-Noncoding/Yeast-Noncoding 3-Filtered/testing.fastq 4-Bowtied/testing.sam"
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
%%bash
# Create the test/ directory; with -p this succeeds even if it already exists.
mkdir -p test/
``` ```
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment