{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# This notebooks shows how you build the parsed data from all the files from the config files\n", "\n", "This will be of use when you:\n", "1. are using some genome coordinates not provided in [here](https://www.dropbox.com/home/Her%2CHsuan-Lin%20Charlene/data)\n", "2. you need some other features built-in\n", "3. include non-canonical transcripts\n", "\n", "Prerequsites:\n", "1. make sure files in `config/COORD.ini` are correctly specified.\n", "\n", "# Step 1: choose the coordinates" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "please set the right config according to genome coordinate\n", "Using /home/hsher/gencode_coords/GRCh38.p13.genome.fa\n", "Using HG38 by default\n", "Using /tscc/nfs/home/hsher/GRCh38.primary_assembly.genome.fa.gz\n" ] } ], "source": [ "from pybedtools import BedTool\n", "import pickle\n", "import pandas as pd\n", "import os\n", "import sys\n", "import metadensity as md\n", "from pathlib import Path\n", "\n", "if not os.path.isfile('/tscc/nfs/home/hsher/Metadensity/config/hg38_v40.ini'):\n", " raise Exception(\"Config file does not exist. Create one by following https://github.com/algaebrown/Metadensity\") \n", "md.settings.from_config_file('/tscc/nfs/home/hsher/Metadensity/config/hg38_v40.ini')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Step 2: check if basic file exists" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/tscc/nfs/home/hsher/annotations/gencode.v40.primary_assembly.annotation.gff3.gz\n" ] } ], "source": [ "# load gencode files, and also downloaded branchpoint files\n", "# coord = BedTool(md.settings.gencode_feature_fname)\n", "# transcripts = BedTool(md.settings.transcript_fname)\n", "gencode = BedTool(md.settings.gencode_fname)\n", "print(md.settings.gencode_fname)\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Step 3: read branchpoints files" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "found 59328 branchpoints in experiment\n" ] } ], "source": [ "# sequencing branchpoints\n", "if os.path.isfile(md.settings.branchpoint_fname):\n", " branchpoint = BedTool(md.settings.branchpoint_fname)\n", " print(f'found {len(branchpoint)} branchpoints in experiment')\n", "else:\n", " branchpoint = None\n", " print(f'No branchpoint file provided in config file: {md.settings.branchpoint_fname}')" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "found 367672 branchpoints in ML prediction\n" ] } ], "source": [ "# machine learning branchpoints\n", "if os.path.isfile(md.settings.branchpoint_pred_fname):\n", " branchpoint_pred = pd.read_csv(md.settings.branchpoint_pred_fname)\n", " \n", " # make to an interval\n", " branchpoint_pred['start'] = branchpoint_pred['start']-1 # 1\n", " branchpoint_pred_bed = BedTool.from_dataframe(branchpoint_pred[['chromosome', 'start', 'end', 'exon_id', 'exon_number', 'strand']])\n", " print(f'found {len(branchpoint_pred_bed)} branchpoints in ML prediction')\n", "else:\n", " branchpoint_pred_bed = None\n", " print(f'No ML branchpoint file provided in config file: {md.settings.branchpoint_pred_fname}')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Step 4: read polyA files" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "# some criteria, see https://polyasite.unibas.ch/atlas\n", "included_polyA_type = ['TE', 'EX', 'IN', 'DS']" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "# make type specific annotation\n", "def extract_polyA_signal_coordinates(subset_polyadf):\n", " ''' extracting the coordinate of polyA signal from polya dataframe'''\n", " # extract polya signals\n", " signal_coord = []\n", " for index, row in subset_polyadf.iterrows():\n", " if type(row['polyasignal'])==str:\n", " polyasignals = row['polyasignal'].split(';')\n", " for sig in polyasignals:\n", " motif, rela_pos, obs_pos = sig.split('@')\n", " signal_coord.append([row['chrom'],int(obs_pos),int(obs_pos)+1, \n", " row['name'], row['polyatype'], row['strand']])\n", "\n", " # make into bed\n", " polysignal_df = pd.DataFrame(signal_coord,\n", " columns= ['chrom', 'start', 'end', 'name', 'score', 'strand'])\n", " polyasignal_bed = BedTool.from_dataframe(polysignal_df)\n", "\n", " return polyasignal_bed\n", "def polyAtype_specific_coords(polyAtype, polyadf):\n", " ''' \n", " \n", " create polyA related feature for specific types of polyA\n", " \n", " polyAtype: can be TE, EX, IN, DS.. see https://polyasite.unibas.ch/atlas\n", " polyadf: pd.DataFrame from the tsv file\n", " return:\n", " poly_site_bed: BedTool object with polyA site annotation\n", " polya_signal_bed: BedTool object with polyA signals\n", " \n", " '''\n", " # filter for specific types\n", " subset_polyadf = polyadf.loc[polyadf['polyatype']==polyAtype]\n", " \n", " # create bed of polyA sites\n", " polya_site_bed = BedTool.from_dataframe(subset_polyadf)\n", " polya_signal_bed = extract_polyA_signal_coordinates(subset_polyadf)\n", " \n", " return polya_site_bed, polya_signal_bed" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "no polyA annotation from file, \n" ] } ], "source": [ "if os.path.isfile(md.settings.polya_fname):\n", " polyadf = pd.read_csv(os.path.join(md.settings.polya_fname), \n", " sep = '\\t', \n", " header = None, \n", " names = ['chrom', 'start', 'end', 'name', 'average expression', 'strand', 'perc_sample','n_sample', 'avg_tpm', 'polyatype', 'polyasignal'])\n", " polyadf['chrom'] = 'chr'+polyadf['chrom'].astype(str)\n", " \n", "# # filter for specific types\n", "# polydf = polyadf.loc[polyadf['polyatype'].isin(included_polyA_type)]\n", " \n", "# # make into bed\n", "# polya = BedTool.from_dataframe(polyadf)\n", " polya, polyasignal_bed = polyAtype_specific_coords('TE', polyadf = polyadf)\n", " \n", " print(f'found {len(polya)} polyA sites, {len(polyasignal_bed)} polyA signals')\n", "else:\n", " polya = None\n", " polyasignal_bed = None\n", " print(f'no polyA annotation from file, {md.settings.polya_fname}')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# What Metadensity need is a dictionary pickled, saved in the data/ folder.\n", "Like in the `build_transcript_dict` function.\n", "\n", "The dictionary contains several levels\n", "- First level: ensembl ID -> dictionary\n", "- Second level: (1 gene/1 transcript) -> information including chrom, start, end, strand and feature\n", "- Third level: feature dictionary {'feature names':-> set() of (start, end) tuples}\n", "\n", "To add custom features names, you will need to modify the coorisponding Metagene child class.\n", "For example, you want to add `polyA_site` sites to a mRNA, then you need to add `polyA_site` to the `self.featnames` of the `class Protein_coding_gene(Metagene)`.\n", "\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Step 5: organize all features into a gene/transcript-based dictionary" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "# these functions may need to be modified depending on what file format your are inputting\n", "def add_feature(all_dict, feature_bed, coords_with_id, name):\n", " ''' update external feature into transcript/gene centric annotations'''\n", " overlap_transcript = feature_bed.intersect(coords_with_id, wb = True, s = True).saveas()\n", " \n", " for feat in overlap_transcript:\n", " transcript_id = feat[-1].split('transcript_id=')[1].split(';')[0]\n", " gene_id = feat[-1].split('gene_id=')[1].split(';')[0]\n", " \n", " for id_ in [transcript_id, gene_id]:\n", " if name not in all_dict[id_]['features'].keys():\n", "\n", " all_dict[id_]['features'][name] = set()\n", " \n", " all_dict[id_]['features'][name].update([feat.start]) # POINT FEATURE ONLY!\n", " \n", "def first_last_exon_cds(all_dict):\n", " ''' Designate the most 5' CDS/exon as first_CDS/exon, the most 3' CDS/exon as last_CDS/exon\n", " '''\n", " \n", " for key in all_dict.keys():\n", " features = all_dict[key]['features']\n", " \n", " #### exon\n", " if 'exon' in features.keys():\n", " min_start = min([e[0] for e in list(features['exon'])])\n", " max_start = max([e[1] for e in list(features['exon'])])\n", " \n", " if all_dict[key]['strand'] == '+':\n", " features['first_exon'] = set([e for e in list(features['exon']) if e[0] == min_start])\n", " features['last_exon'] = set([e for e in list(features['exon']) if e[1] == max_start])\n", " else:\n", " features['last_exon'] = set([e for e in list(features['exon']) if e[0] == min_start])\n", " features['first_exon'] = set([e for e in list(features['exon']) if e[1] == max_start])\n", " \n", " features['exon'] = features['exon'] - features['first_exon'] - features['last_exon']\n", " else:\n", " #del all_dict[key] # no exon, no need to keep\n", " pass\n", " ## CDS\n", " if 'CDS' in features.keys():\n", " min_start = min([e[0] for e in list(features['CDS'])])\n", " max_start = max([e[1] for e in list(features['CDS'])])\n", " \n", " if all_dict[key]['strand'] == '+':\n", " features['first_CDS'] = set([e for e in list(features['CDS']) if e[0] == min_start])\n", " features['last_CDS'] = set([e for e in list(features['CDS']) if e[1] == max_start])\n", " else:\n", " features['last_CDS'] = set([e for e in list(features['CDS']) if e[0] == min_start])\n", " features['first_CDS'] = set([e for e in list(features['CDS']) if e[1] == max_start])\n", " \n", " features['CDS'] = features['CDS'] - features['first_CDS'] - features['last_CDS']\n", " \n", "def five_three_utr(all_dict):\n", " ''' hg19 annotations don't have UTR. Designate 5 prime and 3 prime UTR'''\n", " for key in all_dict.keys():\n", " features = all_dict[key]['features']\n", " \n", " #### exon\n", " if 'UTR' in features.keys():\n", " min_start = min([e[0] for e in list(features['UTR'])])\n", " max_start = max([e[1] for e in list(features['UTR'])])\n", " \n", " if all_dict[key]['strand'] == '+':\n", " features['five_prime_UTR'] = set([e for e in list(features['UTR']) if e[0] == min_start])\n", " features['three_prime_UTR'] = set([e for e in list(features['UTR']) if e[1] == max_start])\n", " else:\n", " features['five_prime_UTR'] = set([e for e in list(features['UTR']) if e[1] == max_start])\n", " features['three_prime_UTR'] = set([e for e in list(features['UTR']) if e[0] == min_start])\n", " \n", " del features['UTR']\n", "\n", "\n", " \n", "def build_transcript_dict(gencode = gencode, outdir = md.settings.datadir,\n", " branchpoint = branchpoint, branchpoint_pred_bed = branchpoint_pred_bed,\n", " polya = polya, polyasignal_bed = polyasignal_bed):\n", " ''' extract gencode coordinate and save in data\n", " \n", " gencode: BedTool, gencode.v33.annotations.gff3\n", " outdir: md.settings.datadir\n", " \n", " branchpoint: BedTool\n", " branchpoint_pred: BedTool\n", " polya: BedTool\n", " polyasignal_bed: BedTool\n", " '''\n", " \n", " # make a directory for every coordinate\n", " annotation_path = outdir\n", " if not Path(annotation_path).is_file():\n", " Path(annotation_path).mkdir(parents=True, exist_ok = True)\n", " \n", " all_dict = {}\n", " \n", " # filter for transcripts\n", " transcript = gencode.filter(lambda x: x[2] == 'transcript').saveas()\n", " exons = gencode.filter(lambda x: x[2] == 'exon').saveas()\n", " intron = transcript.subtract(exons, s = True).saveas()\n", " \n", " print('extracting from gencode annotations')\n", " for g in gencode:\n", " feature_type = g[2]\n", " \n", " if feature_type == 'transcript' or feature_type == 'gene':\n", " # start a new dict\n", " all_dict[g.attrs['ID']] = {} # ENST or ENSG\n", " all_dict[g.attrs['ID']]['chrom'] = g.chrom\n", " all_dict[g.attrs['ID']]['start'] = g.start\n", " all_dict[g.attrs['ID']]['end'] = g.end\n", " all_dict[g.attrs['ID']]['strand'] = g.strand\n", " all_dict[g.attrs['ID']]['id'] = g.attrs['ID']\n", " all_dict[g.attrs['ID']]['type'] = g.attrs['gene_type']\n", " all_dict[g.attrs['ID']]['name'] = g.attrs['gene_name']\n", " all_dict[g.attrs['ID']]['features'] = {} # start to contain stuffs\n", " else:\n", " transcript_id = g.attrs['transcript_id'] # doesn't always equal to ID, for X,Y chromosome genes\n", " gene_id = g.attrs['gene_id']\n", " \n", " for ids in [transcript_id, gene_id]:\n", " target_dict = all_dict[ids]\n", " if feature_type not in target_dict['features'].keys():\n", " target_dict['features'][feature_type] = set()\n", " target_dict['features'][feature_type].update([(g.start, g.end)])\n", " \n", " print('building intron')\n", " for i in intron:\n", " feature_type = 'intron'\n", " transcript_id = i.attrs['transcript_id']\n", " gene_id = i.attrs['gene_id']\n", " \n", " for ids in [transcript_id, gene_id]:\n", " target_dict = all_dict[ids]\n", " if feature_type not in target_dict['features'].keys():\n", " target_dict['features'][feature_type] = set()\n", " target_dict['features'][feature_type].update([(i.start, i.end)])\n", " \n", " print('building first last exon/cds')\n", " first_last_exon_cds(all_dict)\n", " \n", " print('building five/three prime UTR')\n", " five_three_utr(all_dict)\n", " \n", " print('building branchpoint')\n", " if branchpoint is not None:\n", " add_feature(all_dict, branchpoint, intron, name = 'branchpoint') # experimental\n", " if branchpoint_pred_bed is not None:\n", " add_feature(all_dict, branchpoint_pred_bed, intron, name = 'branchpoint_pred')\n", " \n", " \n", " print('building polyA')\n", " if polya is not None:\n", " add_feature(all_dict, polya, transcript, name = 'polyAsite') # experimental\n", " if polyasignal_bed is not None:\n", " add_feature(all_dict, polyasignal_bed, transcript, name = 'polyAsignal') # experimental\n", " \n", " print('writing to directory')\n", " with open(os.path.join(annotation_path, 'gencode'), 'wb') as f:\n", " pickle.dump(all_dict, f)\n", " return all_dict" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "***** WARNING: File /tmp/pybedtools.ut5if3si.tmp has inconsistent naming convention for record:\n", "GL000009.2\tENSEMBL\texon\t56140\t58376\t.\t-\t.\tID=exon:ENST00000618686.1:1;Parent=ENST00000618686.1;gene_id=ENSG00000278704.1;transcript_id=ENST00000618686.1;gene_type=protein_coding;gene_name=ENSG00000278704;transcript_type=protein_coding;transcript_name=ENST00000618686;exon_number=1;exon_id=ENSE00003753029.1;level=3;protein_id=ENSP00000484918.1;transcript_support_level=NA;tag=basic,Ensembl_canonical\n", "\n", "***** WARNING: File /tmp/pybedtools.ut5if3si.tmp has inconsistent naming convention for record:\n", "GL000009.2\tENSEMBL\texon\t56140\t58376\t.\t-\t.\tID=exon:ENST00000618686.1:1;Parent=ENST00000618686.1;gene_id=ENSG00000278704.1;transcript_id=ENST00000618686.1;gene_type=protein_coding;gene_name=ENSG00000278704;transcript_type=protein_coding;transcript_name=ENST00000618686;exon_number=1;exon_id=ENSE00003753029.1;level=3;protein_id=ENSP00000484918.1;transcript_support_level=NA;tag=basic,Ensembl_canonical\n", "\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "extracting from gencode annotations\n", "building intron\n", "building first last exon/cds\n", "building five/three prime UTR\n", "building branchpoint\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "***** WARNING: File /tmp/pybedtools.rcia93bn.tmp has inconsistent naming convention for record:\n", "GL000194.1\tENSEMBL\ttranscript\t55677\t112791\t.\t-\t.\tID=ENST00000613230.1;Parent=ENSG00000277400.1;gene_id=ENSG00000277400.1;transcript_id=ENST00000613230.1;gene_type=protein_coding;gene_name=ENSG00000277400;transcript_type=protein_coding;transcript_name=ENST00000613230;level=3;protein_id=ENSP00000483280.1;transcript_support_level=1;tag=basic,Ensembl_canonical\n", "\n", "***** WARNING: File /tmp/pybedtools.rcia93bn.tmp has inconsistent naming convention for record:\n", "GL000194.1\tENSEMBL\ttranscript\t55677\t112791\t.\t-\t.\tID=ENST00000613230.1;Parent=ENSG00000277400.1;gene_id=ENSG00000277400.1;transcript_id=ENST00000613230.1;gene_type=protein_coding;gene_name=ENSG00000277400;transcript_type=protein_coding;transcript_name=ENST00000613230;level=3;protein_id=ENSP00000483280.1;transcript_support_level=1;tag=basic,Ensembl_canonical\n", "\n", "***** WARNING: File /tmp/pybedtools.rcia93bn.tmp has inconsistent naming convention for record:\n", "GL000194.1\tENSEMBL\ttranscript\t55677\t112791\t.\t-\t.\tID=ENST00000613230.1;Parent=ENSG00000277400.1;gene_id=ENSG00000277400.1;transcript_id=ENST00000613230.1;gene_type=protein_coding;gene_name=ENSG00000277400;transcript_type=protein_coding;transcript_name=ENST00000613230;level=3;protein_id=ENSP00000483280.1;transcript_support_level=1;tag=basic,Ensembl_canonical\n", "\n", "***** WARNING: File /tmp/pybedtools.rcia93bn.tmp has inconsistent naming convention for record:\n", "GL000194.1\tENSEMBL\ttranscript\t55677\t112791\t.\t-\t.\tID=ENST00000613230.1;Parent=ENSG00000277400.1;gene_id=ENSG00000277400.1;transcript_id=ENST00000613230.1;gene_type=protein_coding;gene_name=ENSG00000277400;transcript_type=protein_coding;transcript_name=ENST00000613230;level=3;protein_id=ENSP00000483280.1;transcript_support_level=1;tag=basic,Ensembl_canonical\n", "\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "building polyA\n", "writing to directory\n" ] } ], "source": [ "d = build_transcript_dict()" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'chrom': 'chr1',\n", " 'start': 11868,\n", " 'end': 14409,\n", " 'strand': '+',\n", " 'id': 'ENSG00000223972.5',\n", " 'type': 'transcribed_unprocessed_pseudogene',\n", " 'name': 'DDX11L1',\n", " 'features': {'exon': {(12009, 12057),\n", " (12178, 12227),\n", " (12612, 12697),\n", " (12612, 12721),\n", " (12974, 13052),\n", " (13220, 13374),\n", " (13452, 13670)},\n", " 'intron': {(12227, 12612), (12721, 12974), (13052, 13220)},\n", " 'first_exon': {(11868, 12227)},\n", " 'last_exon': {(13220, 14409)},\n", " 'branchpoint_pred': {12950, 13193, 13201}}}" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# take a look at what's inside\n", "d[list(d.keys())[0]] #" ] } ], "metadata": { "kernelspec": { "display_name": "metadensity", "language": "python", "name": "metadensity" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.12" } }, "nbformat": 4, "nbformat_minor": 4 }