Automatic Update

c201391e · Julien Cornut · 4612494e · c201391e
Commit c201391e authored 9 years ago by Julien Cornut
--- a/PlotData.ipynb
+++ b/PlotData.ipynb
@@ -69,7 +69,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 157,
+   "execution_count": 163,
   "metadata": {
    "collapsed": false,
    "scrolled": true
@@ -79,6 +79,9 @@
     "name": "stdout",
     "output_type": "stream",
     "text": [
+      "\n",
+      "Processed : 5\n",
+      "\n",
      "Undetermined_lane7_pair1\n",
      "flowcell261_lane8_pair1_CAGATC \t\t Processsed\n",
      "flowcell261_lane8_pair1_TGACCA \t\t Processsed\n",
@@ -94,29 +97,6 @@
      "flowcell384_lane7_pair1_TGACCA\n",
      "testing\n"
     ]
-    },
-    {
-     "data": {
-      "text/plain": [
-       "[None,\n",
-       " None,\n",
-       " None,\n",
-       " None,\n",
-       " None,\n",
-       " None,\n",
-       " None,\n",
-       " None,\n",
-       " None,\n",
-       " None,\n",
-       " None,\n",
-       " None,\n",
-       " None,\n",
-       " None]"
-      ]
-     },
-     "execution_count": 157,
-     "metadata": {},
-     "output_type": "execute_result"
    }
   ],
   "source": [
@@ -127,7 +107,10 @@
    "\n",
    "lst = (n+\" \\t\\t Processsed\" if n in wn else n for n in sorted(fn))\n",
    "\n",
-    "[print(l) for l in lst]\n",
+    "print(\"\\nProcessed : {0}\\n\".format(len(fn & wn)))\n",
+    "\n",
+    "for l in lst:\n",
+    "    print(l)\n",
    "\n",
    "    \n"
   ]

 %% Cell type:code id: tags:

 ``` python
 # This lib is needed to parse fastq easily
 from Bio import SeqIO

 # This lib is used to avoid using a bash magic cell to copy a file
 from shutil import copyfile

 # Used to... list directory
 from os import listdir, path

 # Check the time taken by a function to run
 import datetime

 # Enter/Uncomment here the name of the file you want to process

 # fname = "Undetermined_lane7_pair1"
 # fname = "flowcell261_lane8_pair1_CAGATC"
 # fname = "flowcell261_lane8_pair1_TGACCA"
 # fname = "flowcell362_lane4_pair1_ACAGTG"
 # fname = "flowcell362_lane4_pair1_ACTTGA"
 # fname = "flowcell362_lane4_pair1_CAGATC"
 # fname = "flowcell362_lane4_pair1_TGACCA"
 # fname = "flowcell362_lane4_pair1_Undetermined"
 # fname = "flowcell384_lane7_pair1_ACAGTG"
 # fname = "flowcell384_lane7_pair1_ACTTGA"
 # fname = "flowcell384_lane7_pair1_CAGATC"
 # fname = "flowcell384_lane7_pair1_GATCAG"
 # fname = "flowcell384_lane7_pair1_TGACCA"
 fname = "testing"

 # Show the base name (text before the first '.') of every entry in 0-Raws/
 print("\nAvailable files :\n")
 for entry in sorted(listdir("0-Raws/")):
    print(" - " + entry.split('.')[0])
 ```

 %% Output

    
    Available files :
    
     - Undetermined_lane7_pair1
     - flowcell261_lane8_pair1_CAGATC
     - flowcell261_lane8_pair1_TGACCA
     - flowcell362_lane4_pair1_ACAGTG
     - flowcell362_lane4_pair1_ACTTGA
     - flowcell362_lane4_pair1_CAGATC
     - flowcell362_lane4_pair1_TGACCA
     - flowcell362_lane4_pair1_Undetermined
     - flowcell384_lane7_pair1_ACAGTG
     - flowcell384_lane7_pair1_ACTTGA
     - flowcell384_lane7_pair1_CAGATC
     - flowcell384_lane7_pair1_GATCAG
     - flowcell384_lane7_pair1_TGACCA
     - testing

 %% Cell type:code id: tags:

 ``` python
 from pprint import pprint

 # Base names (text before the first '.') of the entries in the raw-input
 # directory (fn) and in the WIG directory (wn)
 fn = {n.split('.')[0] for n in listdir("0-Raws/")}
 wn = {n.split('.')[0] for n in listdir("7-WIGs/")}

 # Lazy sequence of the sorted raw names; names that also appear in wn get
 # the " \t\t Processsed" suffix appended
 lst = (n+" \t\t Processsed" if n in wn else n for n in sorted(fn))

-[print(l) for l in lst]
+print("\nProcessed : {0}\n".format(len(fn & wn)))
+
+for l in lst:
+    print(l)


 ```

 %% Output

+    
+    Processed : 5
+    
    Undetermined_lane7_pair1
    flowcell261_lane8_pair1_CAGATC 		 Processsed
    flowcell261_lane8_pair1_TGACCA 		 Processsed
    flowcell362_lane4_pair1_ACAGTG 		 Processsed
    flowcell362_lane4_pair1_ACTTGA 		 Processsed
    flowcell362_lane4_pair1_CAGATC 		 Processsed
    flowcell362_lane4_pair1_TGACCA
    flowcell362_lane4_pair1_Undetermined
    flowcell384_lane7_pair1_ACAGTG
    flowcell384_lane7_pair1_ACTTGA
    flowcell384_lane7_pair1_CAGATC
    flowcell384_lane7_pair1_GATCAG
    flowcell384_lane7_pair1_TGACCA
    testing

-    [None,
-     None,
-     None,
-     None,
-     None,
-     None,
-     None,
-     None,
-     None,
-     None,
-     None,
-     None,
-     None,
-     None]
-
 %% Cell type:code id: tags:

 ``` python
 %%bash

 # Sample whose intermediate files are inspected; every path below is
 # derived from it so changing the sample only requires editing this line
 export name="testing"

 echo "$name"

 # Line counts of the filtered reads, the alignment output and the
 # ncRNA-removed reads for that sample
 wc -l "3-Filtered/${name}.fastq"
 wc -l "4-Bowtied/${name}.sam"
 wc -l "5-ncRNA-Removed/${name}.fastq"
 ```

 %% Output

    testing
    4000 3-Filtered/testing.fastq
    1415 4-Bowtied/testing.sam
    1780 5-ncRNA-Removed/testing.fastq

 %% Cell type:code id: tags:

 ``` python
 # Number of leading lines of the SAM file to discard before comparing
 lines_to_burn = 415

 from collections import Counter
 from time import sleep

 # Diagnostic cell: cross-checks the read ids of the filtered fastq file
 # against the read names of the SAM file, in both directions.
 #
 # NOTE(review): opening the 5-ncRNA-Removed file in "w" mode truncates it
 # even though the write-back at the bottom is still commented out.
 with open("3-Filtered/"     +fname+".fastq","r") as filtered, \
     open("4-Bowtied/"      +fname+".sam","r")   as matches,  \
     open("5-ncRNA-Removed/"+fname+".fastq","w") as substracted:

    # Load both files completely so each can be scanned more than once
    filt_iter = list(SeqIO.parse(filtered,"fastq"))
    # Reading the whole file and slicing drops the first lines_to_burn
    # lines (presumably the SAM header -- confirm the count per file)
    matches_iter = list(matches)[lines_to_burn:]

    # Split every SAM line once: field 0 is the read name and field 1 is
    # the flag column, the same column the write-back code below tests
    sam_fields = [line.split('\t') for line in matches_iter]
    sam_names = [fields[0] for fields in sam_fields]

    # Occurrence tables replace the nested loops: summing the number of
    # occurrences of each id on the other side yields the same pair counts
    sam_count = Counter(sam_names)
    # Lines without a second column are never counted as unaligned
    sam_unaligned = Counter(fields[0] for fields in sam_fields
                            if len(fields) > 1 and fields[1] == '4')
    fastq_count = Counter(rec.id for rec in filt_iter)

    # i  : number of fastq records
    # im : (fastq record, SAM line) pairs sharing the same id
    # ma : those pairs whose SAM flag is '4' (no reported alignment)
    i = len(filt_iter)
    im = sum(sam_count[rec.id] for rec in filt_iter)
    ma = sum(sam_unaligned[rec.id] for rec in filt_iter)

    print(i)

    # j  : number of SAM lines kept after the burn
    # jm : the same pair count as im, accumulated from the SAM side
    j = len(matches_iter)
    jm = sum(fastq_count[name] for name in sam_names)

    print(i,im,j,jm)

    # Generator over fastq where the corresponding sam field is 4,
    # meaning no reported alignment
    # sub_iter = (rec for rec in filt_iter \
    #            if matches.readline().split('\t')[1] == '4')

    # Write back fastq
    # SeqIO.write(sub_iter,substracted,"fastq")
 ```

 %% Output

    1000
    1000 1000 1000 1000

 %% Cell type:code id: tags:

 ``` python
 with open("3-Filtered/" + "testing" + ".fastq", "r") as filtered, \
     open("4-Bowtied/" + "testing" + ".sam", "r") as matches, \
     open("5-ncRNA-Removed/" + "testing" + ".fastq", "w") as substracted:

    # Lazily parsed records of the filtered fastq file
    filt_iter = SeqIO.parse(filtered, "fastq")

    # Consume the leading SAM lines, stopping three lines short of the
    # 415 lines that are skipped in total
    for _ in range(415 - 3):
        matches.readline()

    # Echo the remaining three skipped lines so they can be inspected
    for _ in range(3):
        print(matches.readline())

    # Lazy selection: every fastq record consumes the next SAM line and is
    # kept only when that line's second tab-separated field is '4',
    # meaning no reported alignment
    sub_iter = filter(
        lambda rec: matches.readline().split('\t')[1] == '4',
        filt_iter,
    )

    # Write the kept records back out as fastq
    SeqIO.write(sub_iter, substracted, "fastq")
 ```

 %% Output

    @SQ	SN:tW(UCA)Q	LN:74
    
    @SQ	SN:tY(GUA)Q	LN:84
    
    @PG	ID:Bowtie	VN:1.1.2	CL:"bowtie --wrapper basic-0 -S -v 3 -p 4 --best ref/2-Indexes/Yeast-Noncoding/Yeast-Noncoding 3-Filtered/testing.fastq 4-Bowtied/testing.sam"
    

 %% Cell type:code id: tags:

 ``` python
 %%bash
 # Create the test/ directory; -p makes this succeed when it already exists
 mkdir -p test/
 ```