Precomputed densities and Heatmaps#

It is not possible to compute lots of Metadensity in memory. So I precompute many of them and store as files

How to send cluster script, please see scripts/run_metadensity.py and scripts/metasubmit.sh as an example

how to load precomputed data#

with the Metadensity.save_deepdish() function, you get an .h5 file that stores the density

[1]:

import deepdish as dd
import os

# let's just load 1 example
base_dir = '/home/hsher/densities'
example = dd.io.load(os.path.join(base_dir, '{}_densityarr.h5'.format('203')))

[2]:

example.keys() # it's essentially the same as the density_array

[2]:

dict_keys([('first_exon', 'left', 'rep1'), ('first_exon', 'left', 'rep2'), ('first_exon', 'right', 'rep1'), ('first_exon', 'right', 'rep2'), ('exon', 'left', 'rep1'), ('exon', 'left', 'rep2'), ('exon', 'right', 'rep1'), ('exon', 'right', 'rep2'), ('intron', 'left', 'rep1'), ('intron', 'left', 'rep2'), ('intron', 'right', 'rep1'), ('intron', 'right', 'rep2'), ('branchpoint', 'left', 'rep1'), ('branchpoint', 'left', 'rep2'), ('branchpoint', 'right', 'rep1'), ('branchpoint', 'right', 'rep2'), ('branchpoint_pred', 'left', 'rep1'), ('branchpoint_pred', 'left', 'rep2'), ('branchpoint_pred', 'right', 'rep1'), ('branchpoint_pred', 'right', 'rep2'), ('last_exon', 'left', 'rep1'), ('last_exon', 'left', 'rep2'), ('last_exon', 'right', 'rep1'), ('last_exon', 'right', 'rep2'), ('five_prime_UTR', 'left', 'rep1'), ('five_prime_UTR', 'left', 'rep2'), ('five_prime_UTR', 'right', 'rep1'), ('five_prime_UTR', 'right', 'rep2'), ('first_CDS', 'left', 'rep1'), ('first_CDS', 'left', 'rep2'), ('first_CDS', 'right', 'rep1'), ('first_CDS', 'right', 'rep2'), ('CDS', 'left', 'rep1'), ('CDS', 'left', 'rep2'), ('CDS', 'right', 'rep1'), ('CDS', 'right', 'rep2'), ('last_CDS', 'left', 'rep1'), ('last_CDS', 'left', 'rep2'), ('last_CDS', 'right', 'rep1'), ('last_CDS', 'right', 'rep2'), ('three_prime_UTR', 'left', 'rep1'), ('three_prime_UTR', 'left', 'rep2'), ('three_prime_UTR', 'right', 'rep1'), ('three_prime_UTR', 'right', 'rep2')])

[3]:

avail_uid = [f.split('_')[0] for f in os.listdir(base_dir) if f.endswith('densityarr.h5')]

Load a list of RBP collaborators are interested in#

[4]:

import pandas as pd
# read it
rbps = pd.read_csv('Final Hits.csv', header = None)[0].tolist()

[5]:

# let's see which of those have eCLIP data. the dataloader script will return what encode has
import sys
sys.path.append('/home/hsher/Metadensity/scripts/')
from dataloader import master_df, encode_data, encode4_data

[6]:

master_df.head()

[6]:

	uid	RBP	Cell line
0	203	HNRNPC	HepG2
1	204	RBFOX2	HepG2
2	205	IGF2BP1	HepG2
3	206	HNRNPK	HepG2
4	209	SRSF7	HepG2

[7]:

with_data = master_df.loc[(master_df['RBP'].isin(rbps)) & (master_df['uid'].isin(avail_uid))]

[8]:

with_data.head()

[8]:

	uid	RBP	Cell line
7	216	SRSF9	HepG2
13	226	IGF2BP2	K562
14	227	SLTM	HepG2
27	256	SLTM	K562
34	278	PRPF8	K562

[9]:

with_data.shape # so we have 46 data matching their need!

[9]:

(45, 3)

Now im gonna load all precomputed densities for the 45, and combine them into a heatmap.#

[10]:

import numpy as np
import seaborn as sns
rna = ['first_exon', 'exon', 'intron', 'last_exon']
protein_coding = ['five_prime_UTR', 'first_CDS', 'CDS', 'last_CDS', 'three_prime_UTR']
################## I KNOW THIS IS INELEGANT BUT WE STILL NEED TO SET COLORS #################
color_dict = {'5\' UTR start': 'tan',
             '5\' UTR end': 'darkgoldenrod',
              'first exon start': 'aquamarine',
              'first exon end': 'mediumaquamarine',
              'exon start': 'mediumseagreen',
              'exon end': 'seagreen',
              'last exon start': 'limegreen',
              'last exon end': 'forestgreen',

              'first CDS start': 'rebeccapurple',
              'first CDS end': 'mediumpurple',
              'CDS start': 'darkorchid',
              'CDS end': 'mediumorchid',
              'last CDS start': 'slateblue',
              'last CDS end': 'darkslateblue',

              'intron start': 'cadetblue',
              'intron end': 'darkcyan',

              '3\' UTR start': 'hotpink',
              '3\' UTR end': 'deeppink',
              'branchpoint start':'tomato',
              'branchpoint end':'darkorange',
              'machine predicted branchpoint start': 'lightcoral',
              'machine predicted branchpoint end': 'rosybrown'
     }
key_to_name = {'exon': 'exon',
 'last_CDS': 'last CDS',
 'intron': 'intron',
 'first_CDS': 'first CDS',
 'last_exon': 'last exon',
 'three_prime_UTR': '3\' UTR',
 'five_prime_UTR': '5\' UTR',
 'first_exon': 'first exon',
 'CDS': 'CDS',
    'branchpoint':'branchpoint',
    'branchpoint_pred': 'machine predicted branchpoint'}

def read_precomputed_array(uid, base_dir = base_dir):
    ''' load deep dish h5 into data structure'''
    return dd.io.load(os.path.join(base_dir, '{}_densityarr.h5'.format(uid)))
def get_feature_length(denarray):
    ''' return feature length for each feature in dictionary'''
    features = list(set([d[0] for d in denarray.keys()]))
    feature_len = [denarray[f, 'left', 'rep1'].shape[1] for f in features]
    return dict(zip(features, feature_len))
def merge_two_reps(denarray, features = rna):
    ''' merge two rep, return mean density'''
    all_mean = []
    for f in features:
        for align in ['left', 'right']:

            rep1 = denarray[f, align, 'rep1']
            rep2 = denarray[f, align, 'rep2']

            all_den = np.concatenate([rep1,rep2], axis = 0) # concat into 1 array
            mean = np.nanmean(all_den, axis = 0)
            all_mean.append(mean)
    return(np.concatenate(all_mean))
def into_one_df(uids, features = rna):
    ''' given a list of uid, concat all features and data into 1 df'''
    all_vector = []
    i = 0
    for uid in uids:
        denarray = read_precomputed_array(uid)
        if i == 0:
            flen = get_feature_length(denarray)
        all_vector.append(merge_two_reps(denarray, features)) # append into results
        i+=1



    df = pd.DataFrame(np.stack(all_vector), index = uids)
    df.fillna(0, inplace = True)
    df.replace(np.nan, 0, inplace = True)

    return df , flen

def get_feat_color(features, flen):
    ''' generate color annotation for heatmap'''
    # get feature length
    colors = []
    names = []
    for f in features:
        length = flen[f]
        name = key_to_name[f]

        for align in ['start', 'end']:
            color_key = '{} {}'.format(name, align)
            names.append(color_key)
            colors += [color_dict[color_key]]*length

    return colors, names

[11]:

df, flen = into_one_df(with_data['uid'].tolist(), rna)
rna_color, names = get_feat_color(rna, flen)

[12]:

df.head() # that's what we get, row = 1 rbp, col = 1 metagene position

[12]:

	0	1	2	3	4	5	6	7	8	9	...	3890	3891	3892	3893	3894	3895	3896	3897	3898	3899
216	0.000091	0.000101	0.000105	0.000115	0.000125	0.000129	0.000135	0.000138	0.000145	0.000151	...	0.000003	0.000003	0.000003	0.000004	0.000003	0.000003	0.000004	0.000003	0.000004	0.000003
226	0.000082	0.000087	0.000094	0.000099	0.000099	0.000112	0.000128	0.000146	0.000157	0.000166	...	0.000037	0.000033	0.000030	0.000028	0.000026	0.000024	0.000023	0.000020	0.000018	0.000017
227	0.000060	0.000061	0.000064	0.000070	0.000070	0.000072	0.000074	0.000076	0.000081	0.000083	...	0.000008	0.000008	0.000007	0.000009	0.000009	0.000009	0.000005	0.000004	0.000004	0.000004
256	0.000056	0.000057	0.000061	0.000063	0.000065	0.000066	0.000069	0.000073	0.000086	0.000091	...	0.000007	0.000006	0.000006	0.000006	0.000007	0.000007	0.000007	0.000008	0.000007	0.000009
278	0.000029	0.000029	0.000030	0.000028	0.000027	0.000026	0.000033	0.000033	0.000032	0.000032	...	0.000020	0.000019	0.000019	0.000019	0.000017	0.000017	0.000014	0.000013	0.000013	0.000012

5 rows × 3900 columns

[13]:

ylabelnames = with_data['RBP']+'_'+with_data['Cell line']
g = sns.clustermap(df,
                   metric ="correlation", method = "complete",
                   row_cluster = True,
                   col_colors = rna_color, col_cluster = False,
                   yticklabels=ylabelnames,
                   figsize=(16, 10), cmap = 'YlGnBu', cbar_pos=(1.1, .2, .03, .4), colors_ratio = 0.015, vmax = 0.0007)
g.ax_heatmap.set_ylabel('RBPs')
g.ax_heatmap.set_xlabel('metagene position')

for label in names:
    g.ax_col_dendrogram.bar(0, 0, color=color_dict[label],
                            label=label, linewidth=0)
g.ax_col_dendrogram.legend(loc="best", bbox_to_anchor=(-0.05, 1.5), ncol=1)
g.savefig("cluster_rna.svg", dpi = 300, format = 'svg')

/home/hsher/miniconda3/envs/metadensity/lib/python3.7/site-packages/seaborn-0.11.0-py3.7.egg/seaborn/matrix.py:659: UserWarning: Clustering large matrix with scipy. Installing `fastcluster` may give better performance.
  warnings.warn(msg)

_images/Heatmap_from_precomputed_data_15_1.png

let’s do it for another feature#

[14]:

df, flen = into_one_df(with_data['uid'].tolist(), protein_coding)
cds_color, names = get_feat_color(protein_coding, flen)

ylabelnames = with_data['RBP']+'_'+with_data['Cell line']
g = sns.clustermap(df,
                   metric ="correlation", method = "complete",
                   row_cluster = True,
                   col_colors = cds_color, col_cluster = False,
                   yticklabels=ylabelnames,
                   figsize=(16, 10), cmap = 'YlGnBu', cbar_pos=(1.1, .2, .03, .4), colors_ratio = 0.015, vmax = 0.0007)
g.ax_heatmap.set_ylabel('RBPs')
g.ax_heatmap.set_xlabel('metagene position')

for label in names:
    g.ax_col_dendrogram.bar(0, 0, color=color_dict[label],
                            label=label, linewidth=0)
g.ax_col_dendrogram.legend(loc="best", bbox_to_anchor=(-0.05, 1.5), ncol=1)
g.savefig("cluster_cds.svg", dpi = 300, format = 'svg')

/home/hsher/miniconda3/envs/metadensity/lib/python3.7/site-packages/seaborn-0.11.0-py3.7.egg/seaborn/matrix.py:659: UserWarning: Clustering large matrix with scipy. Installing `fastcluster` may give better performance.
  warnings.warn(msg)

_images/Heatmap_from_precomputed_data_17_1.png

[15]:

branchpoints = ['branchpoint', 'branchpoint_pred']
df, flen = into_one_df(with_data['uid'].tolist(), branchpoints)
br_color, names = get_feat_color(branchpoints, flen)

ylabelnames = with_data['RBP']+'_'+with_data['Cell line']
g = sns.clustermap(df,
                   metric ="correlation", method = "complete",
                   row_cluster = True,
                   col_colors = br_color, col_cluster = False,
                   yticklabels=ylabelnames,
                   figsize=(16, 10), cmap = 'YlGnBu', cbar_pos=(1.1, .2, .03, .4), colors_ratio = 0.015, vmax = 0.0007)
g.ax_heatmap.set_ylabel('RBPs')
g.ax_heatmap.set_xlabel('metagene position')

for label in names:
    g.ax_col_dendrogram.bar(0, 0, color=color_dict[label],
                            label=label, linewidth=0)
g.ax_col_dendrogram.legend(loc="best", bbox_to_anchor=(-0.05, 1.5), ncol=1)
g.savefig("cluster_br.svg", dpi = 300, format = 'svg')

_images/Heatmap_from_precomputed_data_18_0.png

[ ]: