Created
November 9, 2020 10:58
-
-
Save alimanfoo/5b3e86966ac02787723df28c53c19334 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "cells": [ | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "# AgamP4 resource in GCS\n", | |
| "\n", | |
| "A quick guide to accessing the AgamP4 reference genome and genome annotations on Google Cloud Storage (GCS)." | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "## Reference genome - read from GCS\n", | |
| "\n", | |
| "The reference genome is available in Zarr format and can be read directly from GCS via fsspec, without downloading it first..." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 1, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "import zarr\n", | |
| "import fsspec" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 2, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "<zarr.hierarchy.Group '/'>" | |
| ] | |
| }, | |
| "execution_count": 2, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "genome_path_gcs = 'gs://vo_agam_release/reference/genome/agamp4/Anopheles-gambiae-PEST_CHROMOSOMES_AgamP4.zarr'\n", | |
| "genome_store = fsspec.get_mapper(genome_path_gcs)\n", | |
| "genome = zarr.open_consolidated(genome_store)\n", | |
| "genome" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 3, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "/\n", | |
| " ├── 2L (49364325,) |S1\n", | |
| " ├── 2R (61545105,) |S1\n", | |
| " ├── 3L (41963435,) |S1\n", | |
| " ├── 3R (53200684,) |S1\n", | |
| " ├── Mt (15363,) |S1\n", | |
| " ├── UNKN (42389979,) |S1\n", | |
| " ├── X (24393108,) |S1\n", | |
| " └── Y_unplaced (237045,) |S1\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "print(genome.tree())" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 4, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "49364325" | |
| ] | |
| }, | |
| "execution_count": 4, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "len(genome['2L'])" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 5, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "<zarr.core.Array '/2L' (49364325,) |S1>" | |
| ] | |
| }, | |
| "execution_count": 5, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "genome['2L']" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 7, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "array([b'a', b'a', b'c', ..., b'a', b'a', b'a'], dtype='|S1')" | |
| ] | |
| }, | |
| "execution_count": 7, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "seq = genome['2L'][:]\n", | |
| "seq" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "## Reference genome - local download\n", | |
| "\n", | |
| "You can also download the reference genome as a gzipped FASTA file and read it locally with pyfasta if you prefer..." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 8, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "--2020-11-09 10:57:01-- https://storage.googleapis.com/vo_agam_release/reference/genome/agamp4/Anopheles-gambiae-PEST_CHROMOSOMES_AgamP4.fa.gz\n", | |
| "Resolving storage.googleapis.com (storage.googleapis.com)... 64.233.191.128, 173.194.192.128, 209.85.146.128, ...\n", | |
| "Connecting to storage.googleapis.com (storage.googleapis.com)|64.233.191.128|:443... connected.\n", | |
| "HTTP request sent, awaiting response... 200 OK\n", | |
| "Length: 80872688 (77M) [application/gzip]\n", | |
| "Saving to: ‘Anopheles-gambiae-PEST_CHROMOSOMES_AgamP4.fa.gz’\n", | |
| "\n", | |
| "Anopheles-gambiae-P 100%[===================>] 77.13M 111MB/s in 0.7s \n", | |
| "\n", | |
| "2020-11-09 10:57:02 (111 MB/s) - ‘Anopheles-gambiae-PEST_CHROMOSOMES_AgamP4.fa.gz’ saved [80872688/80872688]\n", | |
| "\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "!wget --no-clobber https://storage.googleapis.com/vo_agam_release/reference/genome/agamp4/Anopheles-gambiae-PEST_CHROMOSOMES_AgamP4.fa.gz" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 9, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "!gunzip --keep --force Anopheles-gambiae-PEST_CHROMOSOMES_AgamP4.fa.gz" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 10, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "import pyfasta" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 11, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "<pyfasta.fasta.Fasta at 0x7f269a72d750>" | |
| ] | |
| }, | |
| "execution_count": 11, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "# key_fn keeps only the first whitespace-delimited token of each FASTA header,\n", | |
| "# so sequences are keyed by that token rather than by the full header line.\n", | |
| "# A separate name is used so the zarr `genome` group opened above stays available.\n", | |
| "genome_fasta = pyfasta.Fasta('Anopheles-gambiae-PEST_CHROMOSOMES_AgamP4.fa', key_fn=lambda x: x.split()[0])\n", | |
| "genome_fasta" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "## Genome annotations - read from GCS\n", | |
| "\n", | |
| "Genome annotations (AgamP4.12, the last version produced before VectorBase migrated to EuPathDB) can be read directly from GCS via petl..." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 12, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "import petl as etl\n", | |
| "import petlx" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 13, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "geneset_path_gcs = 'gs://vo_agam_release/reference/genome/agamp4/Anopheles-gambiae-PEST_BASEFEATURES_AgamP4.12.gff3.gz'" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 14, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<table class='petl'>\n", | |
| "<thead>\n", | |
| "<tr>\n", | |
| "<th>seqid</th>\n", | |
| "<th>source</th>\n", | |
| "<th>type</th>\n", | |
| "<th>start</th>\n", | |
| "<th>end</th>\n", | |
| "<th>score</th>\n", | |
| "<th>strand</th>\n", | |
| "<th>phase</th>\n", | |
| "<th>attributes</th>\n", | |
| "</tr>\n", | |
| "</thead>\n", | |
| "<tbody>\n", | |
| "<tr>\n", | |
| "<td>2L</td>\n", | |
| "<td>VectorBase</td>\n", | |
| "<td>chromosome</td>\n", | |
| "<td style='text-align: right'>1</td>\n", | |
| "<td style='text-align: right'>49364325</td>\n", | |
| "<td>.</td>\n", | |
| "<td>.</td>\n", | |
| "<td>.</td>\n", | |
| "<td>{'ID': '2L', 'Alias': 'CM000356.1'}</td>\n", | |
| "</tr>\n", | |
| "<tr>\n", | |
| "<td>2L</td>\n", | |
| "<td>VectorBase</td>\n", | |
| "<td>gene</td>\n", | |
| "<td style='text-align: right'>157348</td>\n", | |
| "<td style='text-align: right'>186936</td>\n", | |
| "<td>.</td>\n", | |
| "<td>-</td>\n", | |
| "<td>.</td>\n", | |
| "<td>{'ID': 'AGAP004677', 'biotype': 'protein_coding', 'description': 'methylenetetrahydrofolate dehydrogenase(NAD ) / 5,10-methenyltetrahydrofolate [Source:VB Community Annotation]', 'version': '1'}</td>\n", | |
| "</tr>\n", | |
| "<tr>\n", | |
| "<td>2L</td>\n", | |
| "<td>VectorBase</td>\n", | |
| "<td>mRNA</td>\n", | |
| "<td style='text-align: right'>157348</td>\n", | |
| "<td style='text-align: right'>181305</td>\n", | |
| "<td>.</td>\n", | |
| "<td>-</td>\n", | |
| "<td>.</td>\n", | |
| "<td>{'ID': 'AGAP004677-RA', 'Parent': 'AGAP004677', 'Dbxref': 'Celera_Pep:agCP1943,KEGG_Enzyme:00670 1.5.1.5 3.5.4.9,KEGG_Enzyme:00720 1.5.1.5 3.5.4.9,RefSeq:XM_001687731.1,RefSeq:XP_001687783.1,STRING:7165.AGAP004677-PA,UniParc:UPI0000020060,UniProtKB:A7UTF7,NCBI_GP:EDO64016.1', 'Ontology_term': 'GO:0003824,GO:0004477,GO:0004487,GO:0004488,GO:0035999,GO:0046653,GO:0055114', 'biotype': 'protein_coding', 'version': '1'}</td>\n", | |
| "</tr>\n", | |
| "<tr>\n", | |
| "<td>2L</td>\n", | |
| "<td>VectorBase</td>\n", | |
| "<td>three_prime_UTR</td>\n", | |
| "<td style='text-align: right'>157348</td>\n", | |
| "<td style='text-align: right'>157495</td>\n", | |
| "<td>.</td>\n", | |
| "<td>-</td>\n", | |
| "<td>.</td>\n", | |
| "<td>{'Parent': 'AGAP004677-RA'}</td>\n", | |
| "</tr>\n", | |
| "<tr>\n", | |
| "<td>2L</td>\n", | |
| "<td>VectorBase</td>\n", | |
| "<td>exon</td>\n", | |
| "<td style='text-align: right'>157348</td>\n", | |
| "<td style='text-align: right'>157623</td>\n", | |
| "<td>.</td>\n", | |
| "<td>-</td>\n", | |
| "<td>.</td>\n", | |
| "<td>{'Parent': 'AGAP004677-RA', 'Name': 'AGAP004677-RB-E4', 'constitutive': '1', 'rank': '4'}</td>\n", | |
| "</tr>\n", | |
| "</tbody>\n", | |
| "</table>\n", | |
| "<p><strong>...</strong></p>" | |
| ], | |
| "text/plain": [ | |
| "+-------+--------------+-------------------+--------+----------+-------+--------+-------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", | |
| "| seqid | source | type | start | end | score | strand | phase | attributes |\n", | |
| "+=======+==============+===================+========+==========+=======+========+=======+====================================================================================================================================================================================================================================================================================================================================================================================================================================+\n", | |
| "| '2L' | 'VectorBase' | 'chromosome' | 1 | 49364325 | '.' | '.' | '.' | {'ID': '2L', 'Alias': 'CM000356.1'} |\n", | |
| "+-------+--------------+-------------------+--------+----------+-------+--------+-------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", | |
| "| '2L' | 'VectorBase' | 'gene' | 157348 | 186936 | '.' | '-' | '.' | {'ID': 'AGAP004677', 'biotype': 'protein_coding', 'description': 'methylenetetrahydrofolate dehydrogenase(NAD ) / 5,10-methenyltetrahydrofolate [Source:VB Community Annotation]', 'version': '1'} |\n", | |
| "+-------+--------------+-------------------+--------+----------+-------+--------+-------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", | |
| "| '2L' | 'VectorBase' | 'mRNA' | 157348 | 181305 | '.' | '-' | '.' | {'ID': 'AGAP004677-RA', 'Parent': 'AGAP004677', 'Dbxref': 'Celera_Pep:agCP1943,KEGG_Enzyme:00670 1.5.1.5 3.5.4.9,KEGG_Enzyme:00720 1.5.1.5 3.5.4.9,RefSeq:XM_001687731.1,RefSeq:XP_001687783.1,STRING:7165.AGAP004677-PA,UniParc:UPI0000020060,UniProtKB:A7UTF7,NCBI_GP:EDO64016.1', 'Ontology_term': 'GO:0003824,GO:0004477,GO:0004487,GO:0004488,GO:0035999,GO:0046653,GO:0055114', 'biotype': 'protein_coding', 'version': '1'} |\n", | |
| "+-------+--------------+-------------------+--------+----------+-------+--------+-------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", | |
| "| '2L' | 'VectorBase' | 'three_prime_UTR' | 157348 | 157495 | '.' | '-' | '.' | {'Parent': 'AGAP004677-RA'} |\n", | |
| "+-------+--------------+-------------------+--------+----------+-------+--------+-------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", | |
| "| '2L' | 'VectorBase' | 'exon' | 157348 | 157623 | '.' | '-' | '.' | {'Parent': 'AGAP004677-RA', 'Name': 'AGAP004677-RB-E4', 'constitutive': '1', 'rank': '4'} |\n", | |
| "+-------+--------------+-------------------+--------+----------+-------+--------+-------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", | |
| "..." | |
| ] | |
| }, | |
| "execution_count": 14, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "tbl_geneset = etl.fromgff3(geneset_path_gcs)\n", | |
| "tbl_geneset" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "If you prefer to use pandas, here's how to make a dataframe..." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 16, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>seqid</th>\n", | |
| " <th>source</th>\n", | |
| " <th>type</th>\n", | |
| " <th>start</th>\n", | |
| " <th>end</th>\n", | |
| " <th>score</th>\n", | |
| " <th>strand</th>\n", | |
| " <th>phase</th>\n", | |
| " <th>ID</th>\n", | |
| " <th>Name</th>\n", | |
| " <th>Parent</th>\n", | |
| " <th>biotype</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>2L</td>\n", | |
| " <td>VectorBase</td>\n", | |
| " <td>chromosome</td>\n", | |
| " <td>1</td>\n", | |
| " <td>49364325</td>\n", | |
| " <td>.</td>\n", | |
| " <td>.</td>\n", | |
| " <td>.</td>\n", | |
| " <td>2L</td>\n", | |
| " <td>None</td>\n", | |
| " <td>None</td>\n", | |
| " <td>None</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td>2L</td>\n", | |
| " <td>VectorBase</td>\n", | |
| " <td>gene</td>\n", | |
| " <td>157348</td>\n", | |
| " <td>186936</td>\n", | |
| " <td>.</td>\n", | |
| " <td>-</td>\n", | |
| " <td>.</td>\n", | |
| " <td>AGAP004677</td>\n", | |
| " <td>None</td>\n", | |
| " <td>None</td>\n", | |
| " <td>protein_coding</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2</th>\n", | |
| " <td>2L</td>\n", | |
| " <td>VectorBase</td>\n", | |
| " <td>mRNA</td>\n", | |
| " <td>157348</td>\n", | |
| " <td>181305</td>\n", | |
| " <td>.</td>\n", | |
| " <td>-</td>\n", | |
| " <td>.</td>\n", | |
| " <td>AGAP004677-RA</td>\n", | |
| " <td>None</td>\n", | |
| " <td>AGAP004677</td>\n", | |
| " <td>protein_coding</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>3</th>\n", | |
| " <td>2L</td>\n", | |
| " <td>VectorBase</td>\n", | |
| " <td>three_prime_UTR</td>\n", | |
| " <td>157348</td>\n", | |
| " <td>157495</td>\n", | |
| " <td>.</td>\n", | |
| " <td>-</td>\n", | |
| " <td>.</td>\n", | |
| " <td>None</td>\n", | |
| " <td>None</td>\n", | |
| " <td>AGAP004677-RA</td>\n", | |
| " <td>None</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>4</th>\n", | |
| " <td>2L</td>\n", | |
| " <td>VectorBase</td>\n", | |
| " <td>exon</td>\n", | |
| " <td>157348</td>\n", | |
| " <td>157623</td>\n", | |
| " <td>.</td>\n", | |
| " <td>-</td>\n", | |
| " <td>.</td>\n", | |
| " <td>None</td>\n", | |
| " <td>AGAP004677-RB-E4</td>\n", | |
| " <td>AGAP004677-RA</td>\n", | |
| " <td>None</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " seqid source type start end score strand phase \\\n", | |
| "0 2L VectorBase chromosome 1 49364325 . . . \n", | |
| "1 2L VectorBase gene 157348 186936 . - . \n", | |
| "2 2L VectorBase mRNA 157348 181305 . - . \n", | |
| "3 2L VectorBase three_prime_UTR 157348 157495 . - . \n", | |
| "4 2L VectorBase exon 157348 157623 . - . \n", | |
| "\n", | |
| " ID Name Parent biotype \n", | |
| "0 2L None None None \n", | |
| "1 AGAP004677 None None protein_coding \n", | |
| "2 AGAP004677-RA None AGAP004677 protein_coding \n", | |
| "3 None None AGAP004677-RA None \n", | |
| "4 None AGAP004677-RB-E4 AGAP004677-RA None " | |
| ] | |
| }, | |
| "execution_count": 16, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "df_geneset = (\n", | |
| " tbl_geneset\n", | |
| " # choose the attributes you want to include as columns\n", | |
| " .unpackdict('attributes', ['ID', 'Name', 'Parent', 'biotype'])\n", | |
| " .todataframe()\n", | |
| ")\n", | |
| "df_geneset.head()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "Note that scikit-allel has a `gff3_to_dataframe` function, but it does not currently support reading directly from cloud storage (PRs welcome), so download the file locally first as shown below." | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "## Genome annotations - local download" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 17, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "--2020-11-09 10:57:42-- https://storage.googleapis.com/vo_agam_release/reference/genome/agamp4/Anopheles-gambiae-PEST_BASEFEATURES_AgamP4.12.gff3.gz\n", | |
| "Resolving storage.googleapis.com (storage.googleapis.com)... 173.194.194.128, 64.233.191.128, 142.250.125.128, ...\n", | |
| "Connecting to storage.googleapis.com (storage.googleapis.com)|173.194.194.128|:443... connected.\n", | |
| "HTTP request sent, awaiting response... 200 OK\n", | |
| "Length: 2724130 (2.6M) [application/gzip]\n", | |
| "Saving to: ‘Anopheles-gambiae-PEST_BASEFEATURES_AgamP4.12.gff3.gz’\n", | |
| "\n", | |
| "Anopheles-gambiae-P 100%[===================>] 2.60M --.-KB/s in 0.02s \n", | |
| "\n", | |
| "2020-11-09 10:57:42 (158 MB/s) - ‘Anopheles-gambiae-PEST_BASEFEATURES_AgamP4.12.gff3.gz’ saved [2724130/2724130]\n", | |
| "\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "!wget --no-clobber https://storage.googleapis.com/vo_agam_release/reference/genome/agamp4/Anopheles-gambiae-PEST_BASEFEATURES_AgamP4.12.gff3.gz" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 19, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "import allel" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 20, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>seqid</th>\n", | |
| " <th>source</th>\n", | |
| " <th>type</th>\n", | |
| " <th>start</th>\n", | |
| " <th>end</th>\n", | |
| " <th>score</th>\n", | |
| " <th>strand</th>\n", | |
| " <th>phase</th>\n", | |
| " <th>ID</th>\n", | |
| " <th>Name</th>\n", | |
| " <th>Parent</th>\n", | |
| " <th>biotype</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>2L</td>\n", | |
| " <td>VectorBase</td>\n", | |
| " <td>chromosome</td>\n", | |
| " <td>1</td>\n", | |
| " <td>49364325</td>\n", | |
| " <td>-1</td>\n", | |
| " <td>.</td>\n", | |
| " <td>-1</td>\n", | |
| " <td>2L</td>\n", | |
| " <td>.</td>\n", | |
| " <td>.</td>\n", | |
| " <td>.</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td>2L</td>\n", | |
| " <td>VectorBase</td>\n", | |
| " <td>gene</td>\n", | |
| " <td>157348</td>\n", | |
| " <td>186936</td>\n", | |
| " <td>-1</td>\n", | |
| " <td>-</td>\n", | |
| " <td>-1</td>\n", | |
| " <td>AGAP004677</td>\n", | |
| " <td>.</td>\n", | |
| " <td>.</td>\n", | |
| " <td>protein_coding</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2</th>\n", | |
| " <td>2L</td>\n", | |
| " <td>VectorBase</td>\n", | |
| " <td>mRNA</td>\n", | |
| " <td>157348</td>\n", | |
| " <td>181305</td>\n", | |
| " <td>-1</td>\n", | |
| " <td>-</td>\n", | |
| " <td>-1</td>\n", | |
| " <td>AGAP004677-RA</td>\n", | |
| " <td>.</td>\n", | |
| " <td>AGAP004677</td>\n", | |
| " <td>protein_coding</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>3</th>\n", | |
| " <td>2L</td>\n", | |
| " <td>VectorBase</td>\n", | |
| " <td>three_prime_UTR</td>\n", | |
| " <td>157348</td>\n", | |
| " <td>157495</td>\n", | |
| " <td>-1</td>\n", | |
| " <td>-</td>\n", | |
| " <td>-1</td>\n", | |
| " <td>.</td>\n", | |
| " <td>.</td>\n", | |
| " <td>AGAP004677-RA</td>\n", | |
| " <td>.</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>4</th>\n", | |
| " <td>2L</td>\n", | |
| " <td>VectorBase</td>\n", | |
| " <td>exon</td>\n", | |
| " <td>157348</td>\n", | |
| " <td>157623</td>\n", | |
| " <td>-1</td>\n", | |
| " <td>-</td>\n", | |
| " <td>-1</td>\n", | |
| " <td>.</td>\n", | |
| " <td>AGAP004677-RB-E4</td>\n", | |
| " <td>AGAP004677-RA</td>\n", | |
| " <td>.</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " seqid source type start end score strand phase \\\n", | |
| "0 2L VectorBase chromosome 1 49364325 -1 . -1 \n", | |
| "1 2L VectorBase gene 157348 186936 -1 - -1 \n", | |
| "2 2L VectorBase mRNA 157348 181305 -1 - -1 \n", | |
| "3 2L VectorBase three_prime_UTR 157348 157495 -1 - -1 \n", | |
| "4 2L VectorBase exon 157348 157623 -1 - -1 \n", | |
| "\n", | |
| " ID Name Parent biotype \n", | |
| "0 2L . . . \n", | |
| "1 AGAP004677 . . protein_coding \n", | |
| "2 AGAP004677-RA . AGAP004677 protein_coding \n", | |
| "3 . . AGAP004677-RA . \n", | |
| "4 . AGAP004677-RB-E4 AGAP004677-RA . " | |
| ] | |
| }, | |
| "execution_count": 20, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "df_geneset = allel.gff3_to_dataframe('Anopheles-gambiae-PEST_BASEFEATURES_AgamP4.12.gff3.gz',\n", | |
| " attributes=['ID', 'Name', 'Parent', 'biotype'])\n", | |
| "df_geneset.head()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [] | |
| } | |
| ], | |
| "metadata": { | |
| "kernelspec": { | |
| "display_name": "Python 3", | |
| "language": "python", | |
| "name": "python3" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 3 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython3", | |
| "version": "3.7.6" | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 4 | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment