How to load your own data & Computing in different genome coordinates & Binding pattern of specific gene!#

the rule is the same. You are just feeding the list of genes in that GO pathway into metadensity and compute them

[1]:

# set up files associated with each genome coordinates
# you need to edit your own .ini files to point the gencode annotation files
import metadensity as md
md.settings.from_config_file('/home/hsher/Metadensity/config/hg19.ini')


# then import the modules
from metadensity.metadensity import *
from metadensity.plotd import *
import pandas as pd
import matplotlib.pyplot as plt


# I have a precompiles list of ENCODE datas as a csv that loads in this dataloader
import sys
sys.path.append('/home/hsher/Metadensity/scripts')
from dataloader import *
%matplotlib inline

plt.style.use('seaborn-white')

Using /projects/ps-yeolab3/bay001/annotations/hg19/hg19.fa
using /projects/ps-yeolab3/bay001/annotations/hg19/hg19.fa
Using:  /home/hsher/gencode_coords/gencode.v19.transcript.gff3

Creating eCLIP object for your data#

the eCLIP object needs a pd.Series that stores all the types of files

[2]:

base_dir = '/home/hsher/jackie_ythdf2/'
from metadensity.readdensity import ReadDensity
def gather_data(cell_line):
    # store where all the files are
    data = pd.Series()

    # iterate over specific naming principle.
    for rep, rname in zip([1,2], ['R1', 'R2']):
        for seq in ['CLIP', 'INPUT']:
            # ===== BIGWIGS ======
            pos = '{}YTHDF2_trackhubs/hg19/YTHDF2.{}_{}_{}.umi.r1TrTr.sorted.STARUnmapped.out.sorted.STARAligned.outSo.rmDupSo.norm.{}.bw'.format(
            base_dir, seq, cell_line,rname, 'pos')
            neg = '{}YTHDF2_trackhubs/hg19/YTHDF2.{}_{}_{}.umi.r1TrTr.sorted.STARUnmapped.out.sorted.STARAligned.outSo.rmDupSo.norm.{}.bw'.format(
            base_dir, seq, cell_line,rname, 'neg')

            if seq == 'CLIP':
                data[f'minus_{rep-1}'] = neg
                data[f'plus_{rep-1}'] = pos
            if seq == 'INPUT':
                data[f'minus_control_{rep-1}'] = neg
                data[f'plus_control_{rep-1}'] = pos

            # ===== BAMS ======
            bam = '{}YTHDF2.{}_{}_{}.umi.r1TrTr.sorted.STARUnmapped.out.sorted.STARAligned.outSo.rmDupSo.bam'.format(
            base_dir, seq, cell_line, rname)

            if seq == 'CLIP':
                data[f'bam_{rep-1}'] = bam

            if seq == 'INPUT':
                data[f'bam_control_{rep-1}'] = bam

            # ===== INFORMATION =====

            # names
            data['uid']=cell_line # THIS THING NEEDS TO BE UNIQUE TO EACH ECLIP LIBRARY. it is called "Unique ID" (uid)
            # if uid is NOT UNIQUE it will RUIN EVERYTHING!
            data['RBP']=cell_line # name for display when plotting. You can write whatever there. won't interfere computation

            # ==== PEAKS ====
            if seq == 'CLIP':

                fname = '{}YTHDF2.{}_{}_{}.umi.r1TrTr.sorted.STARUnmapped.out.sorted.STARAligned.outSo.rmDupSo.peakClusters.normed.bed'.format(
                    base_dir, seq, cell_line, rname)

                data[f'bed_{rep-1}'] = fname
    # Intersect peaks

    fname = '{}idr_peaks/{}_1-{}_2.idr.final.bed'.format(base_dir, cell_line, cell_line)
    data['idr'] = fname



    return data

[3]:

#For reference LM2 and MDA samples are MYC-dependent, MCF7 and SKBR3 are MYC independent
# these data are for YTHDF2 CLIP in different cell lines
lm2 = gather_data('LM2')
mda = gather_data('MDA')
mcf7 = gather_data('MCF7')
SKBR3 = gather_data('SKBR3')

/home/hsher/miniconda3/envs/metadensity/lib/python3.7/site-packages/ipykernel_launcher.py:5: DeprecationWarning: The default dtype for empty Series will be 'object' instead of 'float64' in a future version. Specify a dtype explicitly to silence this warning.
  """

[4]:

lm2 # how the Series should look like

[4]:

minus_0            /home/hsher/jackie_ythdf2/YTHDF2_trackhubs/hg1...
plus_0             /home/hsher/jackie_ythdf2/YTHDF2_trackhubs/hg1...
bam_0              /home/hsher/jackie_ythdf2/YTHDF2.CLIP_LM2_R1.u...
uid                                                              LM2
RBP                                                              LM2
bed_0              /home/hsher/jackie_ythdf2/YTHDF2.CLIP_LM2_R1.u...
minus_control_0    /home/hsher/jackie_ythdf2/YTHDF2_trackhubs/hg1...
plus_control_0     /home/hsher/jackie_ythdf2/YTHDF2_trackhubs/hg1...
bam_control_0      /home/hsher/jackie_ythdf2/YTHDF2.INPUT_LM2_R1....
minus_1            /home/hsher/jackie_ythdf2/YTHDF2_trackhubs/hg1...
plus_1             /home/hsher/jackie_ythdf2/YTHDF2_trackhubs/hg1...
bam_1              /home/hsher/jackie_ythdf2/YTHDF2.CLIP_LM2_R2.u...
bed_1              /home/hsher/jackie_ythdf2/YTHDF2.CLIP_LM2_R2.u...
minus_control_1    /home/hsher/jackie_ythdf2/YTHDF2_trackhubs/hg1...
plus_control_1     /home/hsher/jackie_ythdf2/YTHDF2_trackhubs/hg1...
bam_control_1      /home/hsher/jackie_ythdf2/YTHDF2.INPUT_LM2_R2....
idr                /home/hsher/jackie_ythdf2/idr_peaks/LM2_1-LM2_...
dtype: object

[5]:

# it's worthwhile to see if the data path is correct
import os
lm2.apply(os.path.isfile) # ok it's good to learn every path exists! yeee ha

[5]:

minus_0             True
plus_0              True
bam_0               True
uid                False
RBP                False
bed_0               True
minus_control_0     True
plus_control_0      True
bam_control_0       True
minus_1             True
plus_1              True
bam_1               True
bed_1               True
minus_control_1     True
plus_control_1      True
bam_control_1       True
idr                 True
dtype: bool

[6]:

clips = []

# then you build eCLIP from each pd.Series!
for data in [lm2, mda, mcf7, SKBR3]:

    clips.append(eCLIP.from_series(data, single_end = True))

Build Metagene for MYC only#

[7]:

myc = transcript.filter(lambda x: x.attrs['gene_name']=='MYC').saveas()
print(myc)

chr8    HAVANA  transcript      128748330       128753674       .       +       .       ID=ENST00000377970.2;Parent=ENSG00000136997.10;gene_id=ENSG00000136997.10;transcript_id=ENST00000377970.2;gene_type=protein_coding;gene_status=KNOWN;gene_name=MYC;transcript_type=protein_coding;transcript_status=KNOWN;transcript_name=MYC-001;level=2;protein_id=ENSP00000367207.2;tag=basic,CCDS;ccdsid=CCDS6359.2;havana_gene=OTTHUMG00000128475.3;havana_transcript=OTTHUMT00000250277.3;

[8]:

all_meta_raw = []
for e in clips:
    m = Metadensity(e, name = e.name, transcripts = myc,
                    background_method = None,
                    normalize = None) # raw read coverage in RPM
    m.get_density_array()
    all_meta_raw.append(m)

Using: /home/hsher/Metadensity/metadensity/data/hg19/gencode
Done building metagene
Using: /home/hsher/Metadensity/metadensity/data/hg19/gencode
Done building metagene
Using: /home/hsher/Metadensity/metadensity/data/hg19/gencode
Done building metagene
Using: /home/hsher/Metadensity/metadensity/data/hg19/gencode
Done building metagene

[9]:

f = plot_mean_density(all_meta_raw, features_to_show = generic_rna, ymax = 10)
plt.suptitle('YTHDF2 Raw RPM on MYC gene')
f=beautify(f)

[ ]: