JeffXiePL · JeffXiePL · Sep 18, 2024 · Sep 17, 2024 · Sep 17, 2024 · JeffXiePL
diff --git a/docs/notebooks/Dseq_Features.ipynb b/docs/notebooks/Dseq_Features.ipynb
diff --git a/docs/notebooks/Example_Gibson.ipynb b/docs/notebooks/Example_Gibson.ipynb
@@ -259,8 +259,8 @@
      "traceback": [
       "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
       "\u001b[0;31mValueError\u001b[0m                                Traceback (most recent call last)",
-      "Cell \u001b[0;32mIn[26], line 6\u001b[0m\n\u001b[1;32m      4\u001b[0m pcr_product_F2 \u001b[38;5;241m=\u001b[39m pcr(F2_For, F2_Rev, gene_docs[\u001b[38;5;241m0\u001b[39m], limit\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m40\u001b[39m)\n\u001b[1;32m      5\u001b[0m pcr_product_F3 \u001b[38;5;241m=\u001b[39m pcr(F3_For, F3_Rev, gene_docs[\u001b[38;5;241m0\u001b[39m], limit\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m40\u001b[39m)\n\u001b[0;32m----> 6\u001b[0m pcr_product_BAC \u001b[38;5;241m=\u001b[39m \u001b[43mpcr\u001b[49m\u001b[43m(\u001b[49m\u001b[43mBACF3_Rev\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mBACF1_For\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mpCC1BAC_docs\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;241;43m0\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mlimit\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m69\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m      8\u001b[0m \u001b[38;5;66;03m# Printing out the PCR results\u001b[39;00m\n\u001b[1;32m     10\u001b[0m \u001b[38;5;28mprint\u001b[39m(pcr_product_F1\u001b[38;5;241m.\u001b[39mformat(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mgb\u001b[39m\u001b[38;5;124m\"\u001b[39m))\n",
-      "File \u001b[0;32m~/Desktop/pydna/src/pydna/amplify.py:523\u001b[0m, in \u001b[0;36mpcr\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m    521\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m anneal_primers\u001b[38;5;241m.\u001b[39mproducts[\u001b[38;5;241m0\u001b[39m]\n\u001b[1;32m    522\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(anneal_primers\u001b[38;5;241m.\u001b[39mproducts) \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m0\u001b[39m:\n\u001b[0;32m--> 523\u001b[0m     \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mNo PCR product! \u001b[39m\u001b[38;5;132;01m{\u001b[39;00manneal_primers\u001b[38;5;241m.\u001b[39mreport()\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m    524\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mPCR not specific! \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mformat\u001b[39m(anneal_primers\u001b[38;5;241m.\u001b[39mreport())\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n",
+      "Cell \u001b[0;32mIn[5], line 6\u001b[0m\n\u001b[1;32m      4\u001b[0m pcr_product_F2 \u001b[38;5;241m=\u001b[39m pcr(F2_For, F2_Rev, gene_docs[\u001b[38;5;241m0\u001b[39m], limit\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m40\u001b[39m)\n\u001b[1;32m      5\u001b[0m pcr_product_F3 \u001b[38;5;241m=\u001b[39m pcr(F3_For, F3_Rev, gene_docs[\u001b[38;5;241m0\u001b[39m], limit\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m40\u001b[39m)\n\u001b[0;32m----> 6\u001b[0m pcr_product_BAC \u001b[38;5;241m=\u001b[39m \u001b[43mpcr\u001b[49m\u001b[43m(\u001b[49m\u001b[43mBACF1_For\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mBACF3_Rev\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mpCC1BAC_docs\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;241;43m0\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mlimit\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m69\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m      8\u001b[0m \u001b[38;5;66;03m# Printing out the PCR results\u001b[39;00m\n\u001b[1;32m     10\u001b[0m \u001b[38;5;28mprint\u001b[39m(pcr_product_F1\u001b[38;5;241m.\u001b[39mformat(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mgb\u001b[39m\u001b[38;5;124m\"\u001b[39m))\n",
+      "File \u001b[0;32m~/Documents/OpenSource/summer_internship_2024/pydna/src/pydna/amplify.py:523\u001b[0m, in \u001b[0;36mpcr\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m    521\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m anneal_primers\u001b[38;5;241m.\u001b[39mproducts[\u001b[38;5;241m0\u001b[39m]\n\u001b[1;32m    522\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(anneal_primers\u001b[38;5;241m.\u001b[39mproducts) \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m0\u001b[39m:\n\u001b[0;32m--> 523\u001b[0m     \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mNo PCR product! \u001b[39m\u001b[38;5;132;01m{\u001b[39;00manneal_primers\u001b[38;5;241m.\u001b[39mreport()\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m    524\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mPCR not specific! \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mformat\u001b[39m(anneal_primers\u001b[38;5;241m.\u001b[39mreport())\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n",
       "\u001b[0;31mValueError\u001b[0m: No PCR product! Template EU140750 8128 bp circular limit=69:\nNo forward primers anneal...\nNo reverse primers anneal..."
      ]
     }
@@ -325,7 +325,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.12.4"
+   "version": "3.12.5"
   }
  },
  "nbformat": 4,

diff --git a/docs/notebooks/Example_Restriction.ipynb b/docs/notebooks/Example_Restriction.ipynb
@@ -7,7 +7,11 @@
     "# Example of a Plasmid Restriction/Ligation Cloning\n",
     "> Visit the full library documentation [here](https://bjornfjohansson.github.io/pydna/)\n",
     "\n",
-    "This example showcases a workflow of modelling molecular cloning with restriction enzymes, PCR, and ligases, to clone gene fragments into plasmids. This example constructs a synthetic plasmid by cloning the ase1 gene, which encodes a microtubule associated protein responsible for mitotic spindle assembly, into the pFA6a-kanMX6 cloning vector. The ase1 gene fragment is first cloned from a portion of the Saccharomyces genome through PCR. The pFA6a-kanMX6 cloning vector is then cleaved with AscI and SalI. The ase1 gene fragment is also cleaved with SalI and AscI, and are lastly ligated with the linearized pFA6a-kanMX6 vector.  \n",
+    "This example showcases a workflow of modelling molecular cloning with restriction enzymes, PCR, and ligases, to clone gene fragments into plasmids. This example constructs a synthetic plasmid by cloning the ase1 gene, which encodes a microtubule associated protein responsible for mitotic spindle assembly, into the pFA6a-kanMX6 cloning vector:\n",
+    "\n",
+    "1. The ase1 gene fragment is first amplified from a portion of the _S. pombe_ genome by PCR.\n",
+    "2. The pFA6a-kanMX6 cloning vector is then cleaved with AscI and SalI. The ase1 gene fragment is also cleaved with SalI and AscI.\n",
+    "3. The fragment is ligated with the linearized pFA6a-kanMX6 vector.\n",
     "\n",
     "Source files can be found alongside this notebook, if you would like to follow along. Annotations are made alongside the code to describe key steps."
    ]
@@ -537,16 +541,15 @@
    ],
    "source": [
     "# Parsing the files\n",
-    "\n",
     "pFA6akanMX6_path = \"./pFA6a-kanMX6.gb\"\n",
     "ase1_path = \"./CU329670.gb\"\n",
-    "pFA6_docs = parse(pFA6akanMX6_path)\n",
-    "ase1_docs = parse(ase1_path)\n",
+    "vector = parse(pFA6akanMX6_path)[0]\n",
+    "pombe_chromosome_I = parse(ase1_path)[0]\n",
     "\n",
     "# Printing the parsed files\n",
     "\n",
-    "print(pFA6_docs[0].format(\"gb\"))\n",
-    "print(ase1_docs[0].format(\"gb\"))"
+    "print(vector.format(\"gb\"))\n",
+    "print(pombe_chromosome_I.format(\"gb\"))"
    ]
   },
   {
@@ -566,8 +569,14 @@
    "source": [
     "# Generating primers for the ase1 insert fragment. \n",
     "\n",
-    "fwd_primer_ase1 = Dseqrecord(\"ACCATGTCGAC\") + ase1_docs[0][1000:1020] # Adding a SalI cut site\n",
-    "rvs_primer_ase1_3_start = ase1_docs[0][3516:3546] + Dseqrecord(\"GGCGCGCCAT\") # Adding a AscI cut site\n",
+    "#todo-manu: \n",
+    "# 1. find the feature containing the CDS filtering the list of features\n",
+    "# 2. Use the coordinates of the CDS to design primers using pydna design\n",
+    "# 3. Append the cut site to the primers\n",
+    "# 4. Do the PCR\n",
+    "\n",
+    "fwd_primer_ase1 = Dseqrecord(\"ACCATGTCGAC\") + pombe_chromosome_I[1000:1020] # Adding a SalI cut site\n",
+    "rvs_primer_ase1_3_start = pombe_chromosome_I[3516:3546] + Dseqrecord(\"GGCGCGCCAT\") # Adding an AscI cut site\n",
     "rvs_primer_ase1 = rvs_primer_ase1_3_start.reverse_complement()\n",
     "\n",
     "# Printing out the primers\n",
@@ -593,6 +602,10 @@
    "source": [
     "# Checking that the primer Tm are matching\n",
     "\n",
+    "#todo-manu: \n",
+    "# 1. Check the Tm of the primers using the part that aligns with the genome only (here you are calculating the Tm including the restriction sites, which\n",
+    "# will not anneal to the template during the PCR)\n",
+    "\n",
     "print(tm_default(fwd_primer_ase1.seq)) # Modify the primer sequence above retroactively, if Tm not matching.\n",
     "print(tm_default(rvs_primer_ase1.seq))"
    ]
@@ -709,7 +722,7 @@
    "source": [
     "# Performing a PCR to check that the primers are specific. An error message is returned if otherwise.\n",
     "\n",
-    "pcr_product = pcr(fwd_primer_ase1, rvs_primer_ase1, ase1_docs[0])\n",
+    "pcr_product = pcr(fwd_primer_ase1, rvs_primer_ase1, pombe_chromosome_I)\n",
     "\n",
     "# Printing out the PCR results\n",
     "\n",
@@ -733,7 +746,7 @@
    "source": [
     "# Cleaving the cloning vector with restriction enzymes\n",
     "\n",
-    "plamsid_digests = pFA6_docs[0].cut(SalI, AscI)\n",
+    "plamsid_digests = vector.cut(SalI, AscI)\n",
     "\n",
     "# Cleaving the gene fragment with restriction enzymes\n",
     "\n",
@@ -1009,7 +1022,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.12.4"
+   "version": "3.12.5"
   }
  },
  "nbformat": 4,

diff --git a/docs/notebooks/Gibson.ipynb b/docs/notebooks/Gibson.ipynb
@@ -9,11 +9,12 @@
     "\n",
     "Gibson Assembly is a powerful method to assemble multiple DNA fragments into a single, continuous sequence in a seamless, one-step reaction. Developed by Daniel Gibson and colleagues in 2009, this method has been widely applied to work in molecular cloning, biotechnology, and synthetic biology.  \n",
     "\n",
-    "`pydna` provides the `Assembly` class to simulate the assembly of DNA sequences. This page provides a guide to performing Gibson Assembly with pre-existing DNA fragments, followed by primer design for generating these fragments via the `pcr` method, if needed.\n",
+    "`pydna` provides the `Assembly` class to simulate the assembly of DNA sequences. Below is an example for performing Gibson Assembly with pre-existing DNA fragments, followed by primer design for generating these fragments via the `pcr` method, if needed.\n",
     "\n",
-    "The `Assembly` class simulates Gibson assembly by searching for homologous sequence pairings. The `Assembly` class needs a list of DNA fragments, given in the datatype of `Dseqrecord` objects, and a minimum length of DNA (`limit` parameter) for which a sequence homology is considered.\n",
-    "\n",
-    "The example below shows how to create an `Assembly` object using multiple DNA fragments. `Assembly` takes `Dseqrecord` objects as input that you can directly instantiate (first example), or parse from files (second example). Note that the sequences are always inputted from a 5'-3' direction."
+    "The `Assembly` class takes the following arguments:\n",
+    "  * `frags`: list of DNA fragments as `Dseqrecord` objects.\n",
+    "  * `limit`: the minimum length (in bp) of sequence homology required.\n",
+    "  * `algorithm`: the function used to find homology regions between DNA fragments. For Gibson Assembly, we use the `terminal_overlap` function, which finds homology regions only at the terminal regions. By default, the `Assembly` class uses the `common_sub_strings` function to find homology regions, which finds homology anywhere, as could happen in a homologous recombination event.\n"
    ]
   },
   {
@@ -29,24 +30,25 @@
       "fragments..: 33bp 34bp 35bp\n",
       "limit(bp)..: 14\n",
       "G.nodes....: 6\n",
-      "algorithm..: common_sub_strings\n"
+      "algorithm..: terminal_overlap\n"
      ]
     }
    ],
    "source": [
     "from pydna.dseqrecord import Dseqrecord\n",
     "from pydna.assembly import Assembly\n",
+    "from pydna.common_sub_strings import terminal_overlap\n",
     "\n",
     "#Creating example Dseqrecord sequences\n",
     "fragment1 = Dseqrecord(\"acgatgctatactgCCCCCtgtgctgtgctcta\")\n",
     "fragment2 = Dseqrecord(\"tgtgctgtgctctaTTTTTtattctggctgtatc\")\n",
     "fragment3 = Dseqrecord(\"tattctggctgtatcGGGGGtacgatgctatactg\")\n",
     "\n",
-    "#Cerating a list of sequences to assemble\n",
+    "#Creating a list of sequences to assemble\n",
     "fragments = [fragment1, fragment2, fragment3]\n",
     "\n",
     "#Performing Gibson assembly, with a minimum shared homology of 14bp\n",
-    "assembly = Assembly(fragments, limit=14)\n",
+    "assembly = Assembly(fragments, limit=14, algorithm=terminal_overlap)\n",
     "\n",
     "#Displaying the assembled product\n",
     "print(assembly)"
@@ -56,9 +58,9 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "The printed output shows the length of each assembled DNA sequenece, the minimum length required for sequence homology search, the number of nodes (number of overlapping regions), and the algorithm used for sequence homology search. Please refer to the full `Assembly` module documentation for more information on the algorithm applied.  \n",
+    "The printed output shows the length of each fragment provided to the assembly, the minimum length required for sequence homology search, the number of nodes (number of overlapping regions), and the algorithm used for sequence homology search. Please refer to the full `Assembly` module documentation for more information on the algorithm applied.\n",
     "\n",
-    "To make a circular sequence from an `Assembly`, pydna provides the `assemble_circular` method. The assembled sequence can be printed as normal, as `Dseqrecord` objects. Note that the `assemble_circular` method returns a list, where the first element (index 0) represents the Watson strand as the top strand, and the second element (index 1) represents the Crick strand as the top strand. "
+    "To make a circular sequence from an `Assembly`, pydna provides the `assemble_circular` method. The assembled sequence can be printed as normal, as `Dseqrecord` objects. Note that the `assemble_circular` method returns a list, where the two elements are reverse complements of each other."
    ]
   },
   {
@@ -81,6 +83,7 @@
       "Dseq(o59)\n",
       "acga..GGGt\n",
       "tgct..CCCa\n",
+      "\n",
       "Dseqrecord\n",
       "circular: True\n",
       "size: 59\n",
@@ -103,6 +106,7 @@
     "\n",
     "#Printing the sequence records\n",
     "print(assembly_circ[0])\n",
+    "print()\n",
     "print(assembly_circ[1])\n"
    ]
   },
@@ -130,7 +134,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.12.4"
+   "version": "3.12.5"
   }
  },
  "nbformat": 4,

diff --git a/docs/notebooks/Importing_Seqs.ipynb b/docs/notebooks/Importing_Seqs.ipynb
@@ -7,11 +7,16 @@
     "# Importing and viewing sequence files in pydna\n",
     "> Visit the full library documentation [here](https://bjornfjohansson.github.io/pydna/)\n",
     "\n",
-    "pydna can be used to work with FASTA, Genbank, EMBL, and snapgene files (.fasta, .gb, .embl, .dna). Specifically, pydna provides ways to read these file types, and store them as a record (I.e `Dseqrecord` object) that one can view and work with. Alternatively, pydna can also work with sequences directly passed into python code as a string object, again storing them as a `Dseqrecord` object.\n",
+    "pydna can be used to work with FASTA, GenBank, EMBL, and SnapGene files (.fasta, .gb, .embl, .dna). You can read these files into a `Dseqrecord` object that you can view and work with. You can also instantiate `Dseqrecord` objects directly from strings.\n",
     "\n",
     "## Importing Sequence Files\n",
     "\n",
-    "To import files into pydna is simple. pydna provides the `parse` method to read all DNA sequences in a file into a list. As an input, `parse` can take the path to a file from your computer, or a python string with the file content. The following code shows an example of how to use the `parse` function to import a downloaded FASTA file, but other types of files can also be imported in the same way."
+    "Importing files into pydna is simple. pydna provides the `parse` function to read all DNA sequences in a file into a list. As an input, `parse` can take:\n",
+    "\n",
+    "* The path to a file from your computer\n",
+    "* A python string with the file content.\n",
+    "\n",
+    "The following code shows an example of how to use the `parse` function to import a FASTA file."
    ]
   },
   {
@@ -34,8 +39,8 @@
    "source": [
     "from pydna.parsers import parse\n",
     "\n",
-    "#Import your file into python. \n",
-    "file_path = \"./sequence.fasta\"\n",
+    "#Import your file into python using its path\n",
+    "file_path = \"./U49845.fasta\"\n",
     "files = parse(file_path)\n",
     "\n",
     "#Show your FASTA file in python\n",
@@ -46,16 +51,16 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "Note that I used a relative path. Place your sequence file in the same directory as your code to copy the exact code above. \n",
-    "  \n",
-    "The last line of code uses the `format` method to view the imported file in your Python Interpreter (e.g interactive window on Visual Studio Code). Note that `parse` returns a `list` object, hence requiring `[0]` to take the first element of the list. When you have a FASTA file that contains multiple sequences, you can index the list accordingly (e.g  `[0]`, `[1]`, ...)"
+    "Note that `parse` returns a `list` object, hence requiring `[0]` to take the first element of the list. When you have a FASTA file that contains multiple sequences, you can index the list accordingly (e.g. `[0]`, `[1]`, ...).\n",
+    "\n",
+    "The last line of code uses the `format` method to generate a string representation of the sequence as a FASTA file."
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "Another example, using a complete GenBank file, is shown below. The GenBank file is downloaded [here](https://www.ncbi.nlm.nih.gov/nucleotide/U49845). I've done this on my Mac, and dragged/dropped the sequence.gb file in a series of subfolders on my Desktop. Replace my file path with yours to access the file."
+    "Another example, using a GenBank file ([U49845](https://www.ncbi.nlm.nih.gov/nucleotide/U49845)), is shown below."
    ]
   },
   {
@@ -239,10 +244,10 @@
    "source": [
     "from pydna.parsers import parse\n",
     "\n",
-    "file_path = \"./sequence.gb\"\n",
+    "file_path = \"./U49845.gb\"\n",
     "files = parse(file_path)\n",
     "\n",
-    "#Show your GenBank file in pyton\n",
+    "# Convert the Dseqrecord object into a formatted string in GenBank format\n",
     "files[0].format(\"gb\")\n"
    ]
   },
@@ -339,7 +344,7 @@
     "from Bio.SeqIO import parse as seqio_parse\n",
     "from pydna.dseqrecord import Dseqrecord\n",
     "\n",
-    "file_path = './sequence.gb'\n",
+    "file_path = './U49845.gb'\n",
     "\n",
     "# Extract the first Seqrecord of the SeqIO.parse iterator\n",
     "seq_record = next(seqio_parse(file_path, 'genbank'))\n",
@@ -370,7 +375,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.12.4"
+   "version": "3.12.5"
   }
  },
  "nbformat": 4,