Skip to content
Snippets Groups Projects
Commit c201391e authored by Julien Cornut's avatar Julien Cornut
Browse files

Automatic Update

parent 4612494e
No related branches found
No related tags found
No related merge requests found
%% Cell type:code id: tags:
``` python
# This lib is needed to parse fastq easily
from Bio import SeqIO
# This lib is used to avoid using a bash magic cell to copy a file
from shutil import copyfile
# Used to... list directory
from os import listdir, path
# Check the time taken by a function to run
import datetime
# Sample selection: exactly one `fname` assignment should be active.
# Uncomment the sample to process (or type another name).
# fname = "Undetermined_lane7_pair1"
# fname = "flowcell261_lane8_pair1_CAGATC"
# fname = "flowcell261_lane8_pair1_TGACCA"
# fname = "flowcell362_lane4_pair1_ACAGTG"
# fname = "flowcell362_lane4_pair1_ACTTGA"
# fname = "flowcell362_lane4_pair1_CAGATC"
# fname = "flowcell362_lane4_pair1_TGACCA"
# fname = "flowcell362_lane4_pair1_Undetermined"
# fname = "flowcell384_lane7_pair1_ACAGTG"
# fname = "flowcell384_lane7_pair1_ACTTGA"
# fname = "flowcell384_lane7_pair1_CAGATC"
# fname = "flowcell384_lane7_pair1_GATCAG"
# fname = "flowcell384_lane7_pair1_TGACCA"
fname = "testing"

# List the candidate sample names: every entry of 0-Raws/, sorted,
# reduced to the text before its first '.'
print("\nAvailable files :\n")
raw_entries = sorted(listdir("0-Raws/"))
for entry in raw_entries:
    stem = entry.partition('.')[0]
    print(" - " + stem)
```
%% Output
Available files :
- Undetermined_lane7_pair1
- flowcell261_lane8_pair1_CAGATC
- flowcell261_lane8_pair1_TGACCA
- flowcell362_lane4_pair1_ACAGTG
- flowcell362_lane4_pair1_ACTTGA
- flowcell362_lane4_pair1_CAGATC
- flowcell362_lane4_pair1_TGACCA
- flowcell362_lane4_pair1_Undetermined
- flowcell384_lane7_pair1_ACAGTG
- flowcell384_lane7_pair1_ACTTGA
- flowcell384_lane7_pair1_CAGATC
- flowcell384_lane7_pair1_GATCAG
- flowcell384_lane7_pair1_TGACCA
- testing
%% Cell type:code id: tags:
``` python
from pprint import pprint
# Base names (text before the first '.') of the raw inputs and of the
# WIG files found in 7-WIGs/.
fn = {n.split('.')[0] for n in listdir("0-Raws/")}
wn = {n.split('.')[0] for n in listdir("7-WIGs/")}

# One display line per raw sample, tagged when a WIG with the same base
# name exists. This is a list rather than a generator: a generator can be
# consumed only once, so printing it twice silently printed nothing the
# second time.
lst = [n+" \t\t Processsed" if n in wn else n for n in sorted(fn)]

# A plain loop instead of a comprehension used only for its side effects,
# which also built a useless list of None values.
for line in lst:
    print(line)

# Number of raw samples that also have a WIG file
print("\nProcessed : {0}\n".format(len(fn & wn)))
```
%% Output
Processed : 5
Undetermined_lane7_pair1
flowcell261_lane8_pair1_CAGATC Processsed
flowcell261_lane8_pair1_TGACCA Processsed
flowcell362_lane4_pair1_ACAGTG Processsed
flowcell362_lane4_pair1_ACTTGA Processsed
flowcell362_lane4_pair1_CAGATC Processsed
flowcell362_lane4_pair1_TGACCA
flowcell362_lane4_pair1_Undetermined
flowcell384_lane7_pair1_ACAGTG
flowcell384_lane7_pair1_ACTTGA
flowcell384_lane7_pair1_CAGATC
flowcell384_lane7_pair1_GATCAG
flowcell384_lane7_pair1_TGACCA
testing
[None,
None,
None,
None,
None,
None,
None,
None,
None,
None,
None,
None,
None,
None]
%% Cell type:code id: tags:
``` python
%%bash
# Print the line counts of the intermediate files of one sample.
# Change `name` to inspect another sample; the paths below follow it.
export name="testing"
echo "$name"
wc -l "3-Filtered/${name}.fastq"
wc -l "4-Bowtied/${name}.sam"
wc -l "5-ncRNA-Removed/${name}.fastq"
```
%% Output
testing
4000 3-Filtered/testing.fastq
1415 4-Bowtied/testing.sam
1780 5-ncRNA-Removed/testing.fastq
%% Cell type:code id: tags:
``` python
# Diagnostic cell: cross-check the read names of the filtered FASTQ against
# the read names of the Bowtie SAM output for the sample named by `fname`.

# Number of leading SAM lines discarded before the alignment records
# (presumably the header of this sample's SAM file — confirm when the
# reference index changes).
lines_to_burn = 415
from time import sleep
with open("3-Filtered/" +fname+".fastq","r") as filtered, \
     open("4-Bowtied/" +fname+".sam","r") as matches, \
     open("5-ncRNA-Removed/"+fname+".fastq","w") as substracted:
    # NOTE(review): opening the output in "w" mode empties it even though
    # the write-back at the bottom is still commented out — confirm that
    # this is intended before running on real data.

    # Load every FASTQ record so the records can be scanned repeatedly
    filt_iter = list(SeqIO.parse(filtered,"fastq"))
    # All SAM lines except the first `lines_to_burn`. The file is fully
    # consumed here, so no further readline() calls are needed (they
    # would only return empty strings).
    matches_iter = matches.readlines()[lines_to_burn:]

    i = 0     # FASTQ records visited
    im = 0    # (FASTQ record, SAM line) pairs sharing the same read name
    j = 0     # SAM lines visited
    jm = 0    # same pairs as `im`, counted while walking the SAM lines first
    ma = 0    # pairs counted in `im` whose SAM flag is '4'
    pas = True  # not read anywhere in this cell

    # Pass 1: for each FASTQ record, look for SAM lines with the same name
    for f in filt_iter:
        i += 1
        for line in matches_iter:
            # Split once and keep both fields; reusing one variable for the
            # line and for the read name made the second lookup fail.
            fields = line.split('\t')
            qname = fields[0]
            # The flag is the second tab-separated field and is text, so it
            # is compared with '4' as in the commented generator below.
            # NOTE(review): the draft read field index 9 and compared it
            # with the integer 4 — presumably the flag was meant; confirm.
            flag = fields[1]
            if qname == f.id:
                im += 1
                if flag == '4':
                    ma += 1
    print(i)

    # Pass 2: for each SAM line, look for FASTQ records with the same name
    for line in matches_iter:
        j += 1
        qname = line.split('\t')[0]
        for f in filt_iter:
            if qname == f.id:
                jm += 1
    print(i,im,j,jm)

    # Intended next step, kept for reference:
    # Generator over fastq where the corresponding sam field is 4,
    # meaning no reported alignment
    # sub_iter = (rec for rec in filt_iter \
    #             if matches.readline().split('\t')[1] == '4')
    # Write back fastq
    # SeqIO.write(sub_iter,substracted,"fastq")
```
%% Output
1000
1000 1000 1000 1000
%% Cell type:code id: tags:
``` python
# Keep only the reads of the "testing" sample whose SAM flag is '4'
# (no reported alignment) and write them to 5-ncRNA-Removed/.
sample = "testing"
with open("3-Filtered/" + sample + ".fastq", "r") as filtered, \
     open("4-Bowtied/" + sample + ".sam", "r") as matches, \
     open("5-ncRNA-Removed/" + sample + ".fastq", "w") as substracted:

    # Lazy iterator over the filtered FASTQ records
    filt_iter = SeqIO.parse(filtered, "fastq")

    # Skip the first 415 lines of the SAM file (treated as its header, as
    # in the original script), echoing the last three of them as a check.
    header_lines = 415
    echoed_lines = 3
    for _ in range(header_lines - echoed_lines):
        matches.readline()
    for _ in range(echoed_lines):
        print(matches.readline())

    def unaligned_records(records, sam):
        """Yield each record whose next SAM line has '4' as second field.

        One SAM line is consumed per record, so the two files are read
        in lockstep.
        """
        for rec in records:
            flag = sam.readline().split('\t')[1]
            if flag == '4':
                yield rec

    sub_iter = unaligned_records(filt_iter, matches)

    # Write the surviving records back as FASTQ
    SeqIO.write(sub_iter, substracted, "fastq")
```
%% Output
@SQ SN:tW(UCA)Q LN:74
@SQ SN:tY(GUA)Q LN:84
@PG ID:Bowtie VN:1.1.2 CL:"bowtie --wrapper basic-0 -S -v 3 -p 4 --best ref/2-Indexes/Yeast-Noncoding/Yeast-Noncoding 3-Filtered/testing.fastq 4-Bowtied/testing.sam"
%% Cell type:code id: tags:
``` python
%%bash
# Create the test/ directory; -p makes this a no-op when it already exists
mkdir -p test/
```
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment