{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# This notebooks shows how you build the parsed data from all the files from the config files\n",
    "\n",
    "This will be of use when you:\n",
    "1. are using some genome coordinates not provided in [here](https://www.dropbox.com/home/Her%2CHsuan-Lin%20Charlene/data)\n",
    "2. you need some other features built-in\n",
    "3. include non-canonical transcripts\n",
    "\n",
    "Prerequsites:\n",
    "1. make sure files in `config/COORD.ini` are correctly specified.\n",
    "\n",
    "# Step 1: choose the coordinates"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "please set the right config according to genome coordinate\n",
      "Using /home/hsher/gencode_coords/GRCh38.p13.genome.fa\n",
      "Using HG38 by default\n",
      "Using /tscc/nfs/home/hsher/GRCh38.primary_assembly.genome.fa.gz\n"
     ]
    }
   ],
   "source": [
    "from pybedtools import BedTool\n",
    "import pickle\n",
    "import pandas as pd\n",
    "import os\n",
    "import sys\n",
    "import metadensity as md\n",
    "from pathlib import Path\n",
    "\n",
    "if not os.path.isfile('/tscc/nfs/home/hsher/Metadensity/config/hg38_v40.ini'):\n",
    "    raise Exception(\"Config file does not exist. Create one by following https://github.com/algaebrown/Metadensity\") \n",
    "md.settings.from_config_file('/tscc/nfs/home/hsher/Metadensity/config/hg38_v40.ini')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Step 2: check if basic file exists"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "/tscc/nfs/home/hsher/annotations/gencode.v40.primary_assembly.annotation.gff3.gz\n"
     ]
    }
   ],
   "source": [
    "# load gencode files, and also downloaded branchpoint files\n",
    "# coord = BedTool(md.settings.gencode_feature_fname)\n",
    "# transcripts = BedTool(md.settings.transcript_fname)\n",
    "gencode = BedTool(md.settings.gencode_fname)\n",
    "print(md.settings.gencode_fname)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Step 3: read branchpoints files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "found 59328 branchpoints in experiment\n"
     ]
    }
   ],
   "source": [
    "# sequencing branchpoints\n",
    "if os.path.isfile(md.settings.branchpoint_fname):\n",
    "    branchpoint = BedTool(md.settings.branchpoint_fname)\n",
    "    print(f'found {len(branchpoint)} branchpoints in experiment')\n",
    "else:\n",
    "    branchpoint = None\n",
    "    print(f'No branchpoint file provided in config file: {md.settings.branchpoint_fname}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "found 367672 branchpoints in ML prediction\n"
     ]
    }
   ],
   "source": [
    "# machine learning branchpoints\n",
    "if os.path.isfile(md.settings.branchpoint_pred_fname):\n",
    "    branchpoint_pred = pd.read_csv(md.settings.branchpoint_pred_fname)\n",
    "    \n",
    "    # make to an interval\n",
    "    branchpoint_pred['start'] = branchpoint_pred['start']-1 # 1\n",
    "    branchpoint_pred_bed = BedTool.from_dataframe(branchpoint_pred[['chromosome', 'start', 'end', 'exon_id', 'exon_number', 'strand']])\n",
    "    print(f'found {len(branchpoint_pred_bed)} branchpoints in ML prediction')\n",
    "else:\n",
    "    branchpoint_pred_bed = None\n",
    "    print(f'No ML branchpoint file provided in config file: {md.settings.branchpoint_pred_fname}')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Step 4: read polyA files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# some criteria, see https://polyasite.unibas.ch/atlas\n",
    "included_polyA_type = ['TE', 'EX', 'IN', 'DS']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# make type specific annotation\n",
    "def extract_polyA_signal_coordinates(subset_polyadf):\n",
    "    ''' extracting the coordinate of polyA signal from polya dataframe'''\n",
    "    # extract polya signals\n",
    "    signal_coord = []\n",
    "    for index, row in subset_polyadf.iterrows():\n",
    "        if type(row['polyasignal'])==str:\n",
    "            polyasignals = row['polyasignal'].split(';')\n",
    "            for sig in polyasignals:\n",
    "                motif, rela_pos, obs_pos = sig.split('@')\n",
    "                signal_coord.append([row['chrom'],int(obs_pos),int(obs_pos)+1, \n",
    "                                     row['name'], row['polyatype'], row['strand']])\n",
    "\n",
    "    # make into bed\n",
    "    polysignal_df = pd.DataFrame(signal_coord,\n",
    "                                columns= ['chrom', 'start', 'end', 'name', 'score', 'strand'])\n",
    "    polyasignal_bed = BedTool.from_dataframe(polysignal_df)\n",
    "\n",
    "    return polyasignal_bed\n",
    "def polyAtype_specific_coords(polyAtype, polyadf):\n",
    "    ''' \n",
    "    \n",
    "    create polyA related feature for specific types of polyA\n",
    "    \n",
    "    polyAtype: can be TE, EX, IN, DS.. see https://polyasite.unibas.ch/atlas\n",
    "    polyadf: pd.DataFrame from the tsv file\n",
    "    return:\n",
    "        poly_site_bed: BedTool object with polyA site annotation\n",
    "        polya_signal_bed: BedTool object with polyA signals\n",
    "    \n",
    "    '''\n",
    "    # filter for specific types\n",
    "    subset_polyadf = polyadf.loc[polyadf['polyatype']==polyAtype]\n",
    "    \n",
    "    # create bed of polyA sites\n",
    "    polya_site_bed = BedTool.from_dataframe(subset_polyadf)\n",
    "    polya_signal_bed = extract_polyA_signal_coordinates(subset_polyadf)\n",
    "    \n",
    "    return polya_site_bed, polya_signal_bed"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "no polyA annotation from file, \n"
     ]
    }
   ],
   "source": [
    "if os.path.isfile(md.settings.polya_fname):\n",
    "    polyadf = pd.read_csv(os.path.join(md.settings.polya_fname), \n",
    "                sep = '\\t', \n",
    "                header = None, \n",
    "                names = ['chrom', 'start', 'end', 'name', 'average expression', 'strand', 'perc_sample','n_sample', 'avg_tpm', 'polyatype', 'polyasignal'])\n",
    "    polyadf['chrom'] = 'chr'+polyadf['chrom'].astype(str)\n",
    "    \n",
    "#     # filter for specific types\n",
    "#     polydf = polyadf.loc[polyadf['polyatype'].isin(included_polyA_type)]\n",
    "                         \n",
    "#     # make into bed\n",
    "#     polya = BedTool.from_dataframe(polyadf)\n",
    "    polya, polyasignal_bed = polyAtype_specific_coords('TE', polyadf = polyadf)\n",
    "    \n",
    "    print(f'found {len(polya)} polyA sites, {len(polyasignal_bed)} polyA signals')\n",
    "else:\n",
    "    polya = None\n",
    "    polyasignal_bed = None\n",
    "    print(f'no polyA annotation from file, {md.settings.polya_fname}')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# What Metadensity need is a dictionary pickled, saved in the data/ folder.\n",
    "Like in the `build_transcript_dict` function.\n",
    "\n",
    "The dictionary contains several levels\n",
    "- First level: ensembl ID -> dictionary\n",
    "- Second level: (1 gene/1 transcript) -> information including chrom, start, end, strand and feature\n",
    "- Third level: feature dictionary {'feature names':-> set() of (start, end) tuples}\n",
    "\n",
    "To add custom features names, you will need to modify the coorisponding Metagene child class.\n",
    "For example, you want to add `polyA_site` sites to a mRNA, then you need to add `polyA_site` to the `self.featnames` of the `class Protein_coding_gene(Metagene)`.\n",
    "\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Step 5: organize all features into a gene/transcript-based dictionary"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# these functions may need to be modified depending on what file format your are inputting\n",
    "def add_feature(all_dict, feature_bed, coords_with_id, name):\n",
    "    ''' update external feature into transcript/gene centric annotations'''\n",
    "    overlap_transcript = feature_bed.intersect(coords_with_id, wb = True, s = True).saveas()\n",
    "    \n",
    "    for feat in overlap_transcript:\n",
    "        transcript_id = feat[-1].split('transcript_id=')[1].split(';')[0]\n",
    "        gene_id = feat[-1].split('gene_id=')[1].split(';')[0]\n",
    "        \n",
    "        for id_ in [transcript_id, gene_id]:\n",
    "            if name not in all_dict[id_]['features'].keys():\n",
    "\n",
    "                all_dict[id_]['features'][name] = set()\n",
    "                \n",
    "            all_dict[id_]['features'][name].update([feat.start])  # POINT FEATURE ONLY!\n",
    "                \n",
    "def first_last_exon_cds(all_dict):\n",
    "    ''' Designate the most 5' CDS/exon as first_CDS/exon, the most 3' CDS/exon as last_CDS/exon\n",
    "    '''\n",
    "    \n",
    "    for key in all_dict.keys():\n",
    "        features = all_dict[key]['features']\n",
    "        \n",
    "        #### exon\n",
    "        if 'exon' in features.keys():\n",
    "            min_start = min([e[0] for e in list(features['exon'])])\n",
    "            max_start = max([e[1] for e in list(features['exon'])])\n",
    "        \n",
    "            if all_dict[key]['strand'] == '+':\n",
    "                features['first_exon'] = set([e for e in list(features['exon']) if e[0] == min_start])\n",
    "                features['last_exon'] = set([e for e in list(features['exon']) if e[1] == max_start])\n",
    "            else:\n",
    "                features['last_exon'] = set([e for e in list(features['exon']) if e[0] == min_start])\n",
    "                features['first_exon'] = set([e for e in list(features['exon']) if e[1] == max_start])\n",
    "        \n",
    "            features['exon'] = features['exon'] - features['first_exon'] - features['last_exon']\n",
    "        else:\n",
    "            #del all_dict[key] # no exon, no need to keep\n",
    "            pass\n",
    "        ## CDS\n",
    "        if 'CDS' in features.keys():\n",
    "            min_start = min([e[0] for e in list(features['CDS'])])\n",
    "            max_start = max([e[1] for e in list(features['CDS'])])\n",
    "        \n",
    "            if all_dict[key]['strand'] == '+':\n",
    "                features['first_CDS'] = set([e for e in list(features['CDS']) if e[0] == min_start])\n",
    "                features['last_CDS'] = set([e for e in list(features['CDS']) if e[1] == max_start])\n",
    "            else:\n",
    "                features['last_CDS'] = set([e for e in list(features['CDS']) if e[0] == min_start])\n",
    "                features['first_CDS'] = set([e for e in list(features['CDS']) if e[1] == max_start])\n",
    "        \n",
    "            features['CDS'] = features['CDS'] - features['first_CDS'] - features['last_CDS']\n",
    "        \n",
    "def five_three_utr(all_dict):\n",
    "    ''' hg19 annotations don't have UTR. Designate 5 prime and 3 prime UTR'''\n",
    "    for key in all_dict.keys():\n",
    "        features = all_dict[key]['features']\n",
    "        \n",
    "        #### exon\n",
    "        if 'UTR' in features.keys():\n",
    "            min_start = min([e[0] for e in list(features['UTR'])])\n",
    "            max_start = max([e[1] for e in list(features['UTR'])])\n",
    "        \n",
    "            if all_dict[key]['strand'] == '+':\n",
    "                features['five_prime_UTR'] = set([e for e in list(features['UTR']) if e[0] == min_start])\n",
    "                features['three_prime_UTR'] = set([e for e in list(features['UTR']) if e[1] == max_start])\n",
    "            else:\n",
    "                features['five_prime_UTR'] = set([e for e in list(features['UTR']) if e[1] == max_start])\n",
    "                features['three_prime_UTR'] = set([e for e in list(features['UTR']) if e[0] == min_start])\n",
    "        \n",
    "            del features['UTR']\n",
    "\n",
    "\n",
    "        \n",
    "def build_transcript_dict(gencode = gencode, outdir = md.settings.datadir,\n",
    "                         branchpoint = branchpoint, branchpoint_pred_bed = branchpoint_pred_bed,\n",
    "                         polya = polya, polyasignal_bed = polyasignal_bed):\n",
    "    ''' extract gencode coordinate and save in data\n",
    "    \n",
    "    gencode: BedTool, gencode.v33.annotations.gff3\n",
    "    outdir: md.settings.datadir\n",
    "    \n",
    "    branchpoint: BedTool\n",
    "    branchpoint_pred: BedTool\n",
    "    polya: BedTool\n",
    "    polyasignal_bed: BedTool\n",
    "    '''\n",
    "    \n",
    "    # make a directory for every coordinate\n",
    "    annotation_path = outdir\n",
    "    if not Path(annotation_path).is_file():\n",
    "        Path(annotation_path).mkdir(parents=True, exist_ok = True)\n",
    "    \n",
    "    all_dict = {}\n",
    "    \n",
    "    # filter for transcripts\n",
    "    transcript = gencode.filter(lambda x: x[2] == 'transcript').saveas()\n",
    "    exons = gencode.filter(lambda x: x[2] == 'exon').saveas()\n",
    "    intron = transcript.subtract(exons, s = True).saveas()\n",
    "    \n",
    "    print('extracting from gencode annotations')\n",
    "    for g in gencode:\n",
    "        feature_type = g[2]\n",
    "        \n",
    "        if feature_type == 'transcript' or feature_type == 'gene':\n",
    "            # start a new dict\n",
    "            all_dict[g.attrs['ID']] = {} # ENST or ENSG\n",
    "            all_dict[g.attrs['ID']]['chrom'] = g.chrom\n",
    "            all_dict[g.attrs['ID']]['start'] = g.start\n",
    "            all_dict[g.attrs['ID']]['end'] = g.end\n",
    "            all_dict[g.attrs['ID']]['strand'] = g.strand\n",
    "            all_dict[g.attrs['ID']]['id'] = g.attrs['ID']\n",
    "            all_dict[g.attrs['ID']]['type'] = g.attrs['gene_type']\n",
    "            all_dict[g.attrs['ID']]['name'] = g.attrs['gene_name']\n",
    "            all_dict[g.attrs['ID']]['features'] = {} # start to contain stuffs\n",
    "        else:\n",
    "            transcript_id = g.attrs['transcript_id'] # doesn't always equal to ID, for X,Y chromosome genes\n",
    "            gene_id = g.attrs['gene_id']\n",
    "            \n",
    "            for ids in [transcript_id, gene_id]:\n",
    "                target_dict = all_dict[ids]\n",
    "                if feature_type not in target_dict['features'].keys():\n",
    "                    target_dict['features'][feature_type] = set()\n",
    "                target_dict['features'][feature_type].update([(g.start, g.end)])\n",
    "    \n",
    "    print('building intron')\n",
    "    for i in intron:\n",
    "        feature_type = 'intron'\n",
    "        transcript_id = i.attrs['transcript_id']\n",
    "        gene_id = i.attrs['gene_id']\n",
    "            \n",
    "        for ids in [transcript_id, gene_id]:\n",
    "            target_dict = all_dict[ids]\n",
    "            if feature_type not in target_dict['features'].keys():\n",
    "                target_dict['features'][feature_type] = set()\n",
    "            target_dict['features'][feature_type].update([(i.start, i.end)])\n",
    "    \n",
    "    print('building first last exon/cds')\n",
    "    first_last_exon_cds(all_dict)\n",
    "    \n",
    "    print('building five/three prime UTR')\n",
    "    five_three_utr(all_dict)\n",
    "    \n",
    "    print('building branchpoint')\n",
    "    if branchpoint is not None:\n",
    "        add_feature(all_dict, branchpoint, intron, name = 'branchpoint') # experimental\n",
    "    if branchpoint_pred_bed is not None:\n",
    "        add_feature(all_dict, branchpoint_pred_bed, intron, name = 'branchpoint_pred')\n",
    "    \n",
    "    \n",
    "    print('building polyA')\n",
    "    if polya is not None:\n",
    "        add_feature(all_dict, polya, transcript, name = 'polyAsite') # experimental\n",
    "    if polyasignal_bed is not None:\n",
    "        add_feature(all_dict, polyasignal_bed, transcript, name = 'polyAsignal') # experimental\n",
    "    \n",
    "    print('writing to directory')\n",
    "    with open(os.path.join(annotation_path, 'gencode'), 'wb') as f:\n",
    "        pickle.dump(all_dict, f)\n",
    "    return all_dict"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "***** WARNING: File /tmp/pybedtools.ut5if3si.tmp has inconsistent naming convention for record:\n",
      "GL000009.2\tENSEMBL\texon\t56140\t58376\t.\t-\t.\tID=exon:ENST00000618686.1:1;Parent=ENST00000618686.1;gene_id=ENSG00000278704.1;transcript_id=ENST00000618686.1;gene_type=protein_coding;gene_name=ENSG00000278704;transcript_type=protein_coding;transcript_name=ENST00000618686;exon_number=1;exon_id=ENSE00003753029.1;level=3;protein_id=ENSP00000484918.1;transcript_support_level=NA;tag=basic,Ensembl_canonical\n",
      "\n",
      "***** WARNING: File /tmp/pybedtools.ut5if3si.tmp has inconsistent naming convention for record:\n",
      "GL000009.2\tENSEMBL\texon\t56140\t58376\t.\t-\t.\tID=exon:ENST00000618686.1:1;Parent=ENST00000618686.1;gene_id=ENSG00000278704.1;transcript_id=ENST00000618686.1;gene_type=protein_coding;gene_name=ENSG00000278704;transcript_type=protein_coding;transcript_name=ENST00000618686;exon_number=1;exon_id=ENSE00003753029.1;level=3;protein_id=ENSP00000484918.1;transcript_support_level=NA;tag=basic,Ensembl_canonical\n",
      "\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "extracting from gencode annotations\n",
      "building intron\n",
      "building first last exon/cds\n",
      "building five/three prime UTR\n",
      "building branchpoint\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "***** WARNING: File /tmp/pybedtools.rcia93bn.tmp has inconsistent naming convention for record:\n",
      "GL000194.1\tENSEMBL\ttranscript\t55677\t112791\t.\t-\t.\tID=ENST00000613230.1;Parent=ENSG00000277400.1;gene_id=ENSG00000277400.1;transcript_id=ENST00000613230.1;gene_type=protein_coding;gene_name=ENSG00000277400;transcript_type=protein_coding;transcript_name=ENST00000613230;level=3;protein_id=ENSP00000483280.1;transcript_support_level=1;tag=basic,Ensembl_canonical\n",
      "\n",
      "***** WARNING: File /tmp/pybedtools.rcia93bn.tmp has inconsistent naming convention for record:\n",
      "GL000194.1\tENSEMBL\ttranscript\t55677\t112791\t.\t-\t.\tID=ENST00000613230.1;Parent=ENSG00000277400.1;gene_id=ENSG00000277400.1;transcript_id=ENST00000613230.1;gene_type=protein_coding;gene_name=ENSG00000277400;transcript_type=protein_coding;transcript_name=ENST00000613230;level=3;protein_id=ENSP00000483280.1;transcript_support_level=1;tag=basic,Ensembl_canonical\n",
      "\n",
      "***** WARNING: File /tmp/pybedtools.rcia93bn.tmp has inconsistent naming convention for record:\n",
      "GL000194.1\tENSEMBL\ttranscript\t55677\t112791\t.\t-\t.\tID=ENST00000613230.1;Parent=ENSG00000277400.1;gene_id=ENSG00000277400.1;transcript_id=ENST00000613230.1;gene_type=protein_coding;gene_name=ENSG00000277400;transcript_type=protein_coding;transcript_name=ENST00000613230;level=3;protein_id=ENSP00000483280.1;transcript_support_level=1;tag=basic,Ensembl_canonical\n",
      "\n",
      "***** WARNING: File /tmp/pybedtools.rcia93bn.tmp has inconsistent naming convention for record:\n",
      "GL000194.1\tENSEMBL\ttranscript\t55677\t112791\t.\t-\t.\tID=ENST00000613230.1;Parent=ENSG00000277400.1;gene_id=ENSG00000277400.1;transcript_id=ENST00000613230.1;gene_type=protein_coding;gene_name=ENSG00000277400;transcript_type=protein_coding;transcript_name=ENST00000613230;level=3;protein_id=ENSP00000483280.1;transcript_support_level=1;tag=basic,Ensembl_canonical\n",
      "\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "building polyA\n",
      "writing to directory\n"
     ]
    }
   ],
   "source": [
    "d = build_transcript_dict()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'chrom': 'chr1',\n",
       " 'start': 11868,\n",
       " 'end': 14409,\n",
       " 'strand': '+',\n",
       " 'id': 'ENSG00000223972.5',\n",
       " 'type': 'transcribed_unprocessed_pseudogene',\n",
       " 'name': 'DDX11L1',\n",
       " 'features': {'exon': {(12009, 12057),\n",
       "   (12178, 12227),\n",
       "   (12612, 12697),\n",
       "   (12612, 12721),\n",
       "   (12974, 13052),\n",
       "   (13220, 13374),\n",
       "   (13452, 13670)},\n",
       "  'intron': {(12227, 12612), (12721, 12974), (13052, 13220)},\n",
       "  'first_exon': {(11868, 12227)},\n",
       "  'last_exon': {(13220, 14409)},\n",
       "  'branchpoint_pred': {12950, 13193, 13201}}}"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# take a look at what's inside\n",
    "d[list(d.keys())[0]] #"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "metadensity",
   "language": "python",
   "name": "metadensity"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}