Comparing eCLIP with DeSeq results using Metadensity#
This allows us to generate hypothesis about RBP function - whether it stabilize or destabilize RNA?
[1]:
# set up files associated with each genome coordinates
import metadensity as md
md.settings.from_config_file('/home/hsher/Metadensity/config/hg38.ini')
# then import the modules
from metadensity.metadensity import *
from metadensity.plotd import *
import pandas as pd
import matplotlib.pyplot as plt
# I have a precompiles list of ENCODE datas as a csv that loads in this dataloader
import sys
sys.path.append('/home/hsher/Metadensity/scripts')
from dataloader import *
%matplotlib inline
plt.style.use('seaborn-white')
Using /home/hsher/gencode_coords/GRCh38.p13.genome.fa
using /home/hsher/gencode_coords/GRCh38.p13.genome.fa
Using: /home/hsher/gencode_coords/gencode.v33.transcript.gff3
Get all the DESeq files for each KD experiment#
Here we use ENCODE KD-RNA-seq DeSeq outputs
[2]:
# parse DE filenames
de_path = '/home/hsher/deseq_gccor/normalized/'
all_de_files = os.listdir(de_path)
all_de_files.remove('result_URLs_HepG2.txt')
all_de_files.remove('result_URLs_K562.txt')
all_de_files_rbp = [f.split('-')[0] for f in all_de_files if '-' in f]
all_de_files_cell = [f.split('-')[2].split('_')[0] for f in all_de_files if '-' in f]
de_df = pd.DataFrame([all_de_files, all_de_files_rbp, all_de_files_cell]).T
de_df.columns = ['fname', 'rbp', 'cell line']
[3]:
de_df.head()
[3]:
| fname | rbp | cell line | |
|---|---|---|---|
| 0 | PRPF8-BGHLV17-HepG2_DESeq2_output.txt | PRPF8 | HepG2 |
| 1 | RECQL-LV08-K562_DESeq2_output.txt | RECQL | K562 |
| 2 | TRIM56-BGHLV20-HepG2_DESeq2_output.txt | TRIM56 | HepG2 |
| 3 | ESF1-BGHLV30-HepG2_DESeq2_output.txt | ESF1 | HepG2 |
| 4 | MTPAP-BGKLV34-K562_DESeq2_output.txt | MTPAP | K562 |
The main idea is to build Metagene density on upregulated,dowregulated transcripts, and thos in the nochange group#
[4]:
def DEseq_result(fname):
''' return transcript ID that is up and down regulated'''
# read DEseq results
de = pd.read_csv(de_path+ fname, sep = '\t', header = 0, index_col = 0)
# select those significant
de = de[de['padj']< 0.05]
de.reset_index(inplace = True)
# select fold change high
up = de.loc[de['log2FoldChange']> 1]
down = de.loc[de['log2FoldChange']< -1]
# remove version
up = [i.split('.')[0] for i in up['index'].tolist()]
down = [i.split('.')[0] for i in down['index'].tolist()]
return up, down
def find_direct_bind(id_list, eCLIP, filter_direct = True, enough_transcript = None):
''' find DE genes that has at least 1 idr peak'''
subset = transcript.filter(lambda x: x.attrs['gene_id'].split('.')[0] in id_list and x.attrs['ID'] in enough_transcript).saveas()
if filter_direct:
direct_bind_transcript = subset.intersect(eCLIP.idr, s = True, u = True).saveas()
else:
return subset
return direct_bind_transcript
def main(cell_line, rbp, filter_direct = True, thres = 200):
''' main function: fetch DESeq transcripts, build metagene for each group'''
# start eCLIP object for you, using internal encode data sheet
try:
s = encode_data.loc[(encode_data['RBP'] == rbp)&(encode_data['Cell line'] == cell_line)].iloc[0]
e = eCLIP.from_series(s)
except:
s = encode4_data.loc[(encode4_data['RBP'] == rbp)&(encode4_data['Cell line'] == cell_line)].iloc[0]
e = eCLIP.from_series(s, single_end = True)
print('using encode4')
# find transcripts with at least `thres` reads
transcripts_with_reads = e.enough_transcripts(thres = thres)
print(len(transcripts_with_reads))
# fetch DE genes
fname = de_df.loc[(de_df['rbp'] == rbp) & (de_df['cell line'] == cell_line), 'fname'].values[0]
up, down = DEseq_result(fname)
# further filter for those with at least 1 peak. This step can be ignored
up_transcripts = find_direct_bind(up, e, filter_direct = filter_direct, enough_transcript = transcripts_with_reads)
down_transcripts = find_direct_bind(down, e, filter_direct = filter_direct, enough_transcript = transcripts_with_reads)
backgrounds = set(transcripts_with_reads) - set([t.attrs['ID'] for t in up_transcripts])- set([t.attrs['ID'] for t in down_transcripts])
print('# up reg: {} # down reg: {} background: {}'.format(len(up_transcripts), len(down_transcripts), len(backgrounds)))
# then we create metadensity for each of them
meta_up = Metadensity(e, name=rbp+'_upreg', transcripts = up_transcripts, background_method = 'relative information', normalize = False)
meta_down = Metadensity(e, name=rbp+'_downreg', transcripts = down_transcripts, background_method = 'relative information', normalize = False)
meta_back = Metadensity(e, name=rbp+'_nochange', transcript_ids= list(backgrounds), background_method = 'relative information', normalize = False)
# run
for m in [meta_up, meta_down, meta_back]:
m.get_density_array()
return [meta_up, meta_down, meta_back]
Example 1: DDX3X#
[5]:
from collections import defaultdict
def get_color_dict(rbp):
color_dict = defaultdict(lambda: 'seagreen')
color_dict[rbp+'_upreg'] = 'tomato'
color_dict[rbp+'_downreg'] = 'royalblue'
color_dict[rbp+'_nochange'] = 'grey'
return color_dict
[6]:
ddx_metas = main('HepG2', 'DDX3X', filter_direct = False)
4578
# up reg: 113 # down reg: 689 background: 3776
Using: /home/hsher/Metadensity/metadensity/data/hg38/gencode
Done building metagene
Using: /home/hsher/Metadensity/metadensity/data/hg38/gencode
Done building metagene
Using: /home/hsher/Metadensity/metadensity/data/hg38/gencode
Done building metagene
[7]:
f = plot_mean_density(ddx_metas, plot_std = True,
stat = 'mean',
ymax = 0.003,
alpha = 0.3,
features_to_show =
protein_coding, color_dict = get_color_dict('DDX3X'))
f = beautify(f)
[8]:
# individually
f = plot_rbp_map(ddx_metas, ymax = 0.003, alpha = 0.3, features_to_show = protein_coding)
/home/hsher/miniconda3/envs/metadensity/lib/python3.7/site-packages/metadensity/plotd.py:187: RuntimeWarning: Mean of empty slice
density_concat = np.nanmean(np.stack([den_arr[feat,align, r] for r in m.eCLIP.rep_keys]), axis = 0)
Example 2: UPF1: helicase needed for non-sense mediated decay#
[9]:
upf_metas = main('HepG2', 'UPF1', filter_direct = False)
3206
# up reg: 225 # down reg: 28 background: 2953
Using: /home/hsher/Metadensity/metadensity/data/hg38/gencode
Done building metagene
Using: /home/hsher/Metadensity/metadensity/data/hg38/gencode
Done building metagene
Using: /home/hsher/Metadensity/metadensity/data/hg38/gencode
Done building metagene
[10]:
f = plot_mean_density(upf_metas, plot_std = True, stat = 'mean', ymax = 0.0005, alpha = 0.3, features_to_show = protein_coding, color_dict = get_color_dict('UPF1'))
f = beautify(f)