metadensity package#

Submodules#

metadensity.config module#

class metadensity.config.MetadensityConfig#

Bases: object

__init__()#

from_config_file(filename)#

metadensity.kmer_from_read module#

metadensity.kmer_from_read.get_all_site_seq(bam, chrom='chr1', start=0, end=248956422, strand='+', window=25, single_end=False, read2=True)#: fetch sequence around read start sites

metadensity.kmer_from_read.main(bam_rep1, bam_rep2, bam_input1, bam_input2=None, k=7, chrom='chr1', start=0, end=248956422, strand='+', window=25, n_sample=1000, n_iter=100, single_end=False)#

metadensity.metadensity module#

metadensity.metadensity.Build_many_metagene(id_list=None, sample_no=200, transcript_type='protein_coding')#

[Create Metagene object for regions for key transcript]

Parameters:

id_list ([list], optional) – [List of transcript/gene IDs]. Defaults to None.
sample_no (int, optional) – [number of genes/transcripts to calculate]. Defaults to 200.
transcript_type (str, optional) – [set to None to build all transcript regardless]. Defaults to ‘protein_coding’.

Returns:

[dictionary of ID -> Metagene object]

Return type:

[dict]

class metadensity.metadensity.Meta(eCLIP, name, sample_no=200, metagenes=None, transcripts=None, transcript_ids=None, deep_dish_path=None)#

Bases: object

superclass of to inherit from

__init__(eCLIP, name, sample_no=200, metagenes=None, transcripts=None, transcript_ids=None, deep_dish_path=None)#

apply(func, axis=0)#

[apply function to all features in self.density_array ]

Parameters:

func ([fuction]) – [examples likenp.nanmean, np.median, np.std]
axis (int, optional) – [along with axis do the function applies to]. Defaults to 0.

Returns:

[dictionary, keys = [feat, align, rep], values = np.array]

Return type:

[dict]

build_metagene(sample_no=200, tids=None)#: Use function Build_many_metagenes() to get regions in UTR, intron, exon for each transcripts in self.idr_transcript and self.no_peak; store metagene in self.idr_metagene/self.neg_metagene

concat_density_array(rep='rep1', features=['exon', 'intron'])#

[concatenate density array along specified features. For examples, exon is (n * 200), intron is (n * 500), then the concat array will be (n * 700)]

Parameters:

rep (str, optional) – [which replicate to concat]. Defaults to ‘rep1’.
features (list, optional) – [what features to concat]. Defaults to ‘self.featnames’.

Returns:

[concatenated array]

Return type:

[np.array]

convolve(filter, axis=0)#

[wrapper around np.convolve, to run in all features, use to smooth out signals ]

Parameters:

filter ([np.array]) – [v in np.convolve]
axis (int, optional) – [which dimension to do]. Defaults to 1.

Returns:

[dictionary, keys = [feat, align, rep], values = np.array]

Return type:

[dict]

get_density_array(use_truncation=False, full_CDS=True)#

[Extract density for all features in all metagene, wrap around self.get_feature_density_array]

Parameters:

use_truncation (bool, optional) – [Whether to use truncation or not. If use truncation, fetch values from metagene.truncations. else fetch from metagene.densities]. Defaults to False.
full_CDS (bool, optional) – [Whether to concat all CDS and make a feature called ‘full_CDS’]. Defaults to True.

get_feature_density_array(feature, target_len, align, pad_value=nan, use_truncation=False)#

[align features by padding np.nan to the 5’ or 3’ end]

Parameters:

feature ([str]) – [type of feature to fetch. keys of metagene.features, can be ‘exon’, ‘first_exon’, ‘intron’…etc. check metagene.features]
target_len ([int]) – [The windowing length of that feature.]
align ([str]) – [whether to align to 5 prime or 3 prime, ‘left’(5 prime) or ‘right’(3 prime)]
pad_value ([float], optional) – [What value to pad if the feature is shorter than designated target_len]. Defaults to np.nan.
use_truncation (bool, optional) – [Whether to use truncation or not. If use truncation, fetch values from metagene.truncations. else fetch from metagene.densities]. Defaults to False.

save_deepdish(outdir)#

[save self.density_array to deepdish]

Parameters:: outdir ([str]) – [path to save]

scale_density_array(method='norm_by_sum')#

[scale density array by only the features considered]

Parameters:: method (str, optional) – [‘norm_by_sum’, ‘max_scalar’]. Defaults to ‘norm_by_sum’.
Returns:: [dictionary, keys = [feat, align, rep], values = np.array]
Return type:: [dict]

class metadensity.metadensity.Metadensity(eCLIP, name, sample_no=200, background_method='subtract', normalize=True, metagenes=None, transcripts=None, transcript_ids=None, deep_dish_path=None)#

Bases: Meta

[Metagene using read coverage (RBP protected fragments)]

Parameters:: Meta ([cls]) – [superclass]

__init__(eCLIP, name, sample_no=200, background_method='subtract', normalize=True, metagenes=None, transcripts=None, transcript_ids=None, deep_dish_path=None)#

[Create a Metadensity class (using read coverage)]

Parameters:

eCLIP ([eCLIP]) – [eCLIP object containing the .bam .bw]
name ([str]) – [The name of this object. Will be used for plotting]
sample_no (int, optional) – [Number of transcripts to calculate. If is shorter than the transcripts specifies, then it will use the first n]. Defaults to 200.
background_method (str, optional) – [‘subtract’; ‘subtract normal’; ‘relative information’]. Defaults to ‘subtract’.
normalize (bool, optional) – [True to use % of edits in position; False to use raw background subtract signals]. Defaults to True.
metagenes ([dictionary of metagenes], optional) – [Use pre-built metagene. Will override sample_no, transcripts and transcript_ids]. Defaults to None.
transcripts ([BedTool], optional) – [Use BedTool containing transcripts]. Defaults to None.
transcript_ids ([list], optional) – [list of transcript or gene ids. the ids needs to be in the pre-built coodinate]. Defaults to None.
deep_dish_path ([str], optional) – [path to precomputed density array deepdish.h5. If specified, directly load from path and override all options]. Defaults to None.

Example

If you want to create Metadensity on a subset of transcripts, here are some ways specify by Enesmbl gene id or transcript id by transcript_ids=. Make sure this ID exists in the annotation version you use!

Metadensity(eCLIP_obj, 'some_eclip', transcript_ids = ['ESNG123456.7'])

To find unique transcripts contain at least 1 region (e.g. peaks) specified, use transcripts= :

your_peak_pybedtool_object = BedTool('path_to_your_peak_bed_file')
selected_transcript = transcript.intersect(your_peak_pybedtool_object, s = True, u = True) # transcript is a built-in pybedtool obj
Metadensity(eCLIP_obj), 'some_eclip', transcripts = selected_transcript)

if you are calculating metadensity in the same subset of genes over multiple clips, you can pre-build Metagene object to save time! use metagene= :

transcript_id_interested = ['ENSG12345.7']
metagene_of_interest = Build_many_metagene(id_list = transcript_id_interested)
m1=Metadensity(eCLIP_obj_1, 'eclip_one_on_these_genes', metagenes = metagene_of_interest)
m2=Metadensity(eCLIP_obj_2, 'eclip_two_on_these_genes', metagenes = metagene_of_interest)

If you specify none of transcripts, transcript_ids or metagene, it will inherently use transcript containing at least 1 IDR peak. If there is no IDR (eCLIP.idr) peak, it falls back to transcript containing at least 1 replicate’s peak. If no peaks are specified in the eCLIP object, it fails.

get_metadensity(background_method='subtract', normalize=True)#: calculate metadensity for each metagene in self.idr_metagene or self.neg_metagene. store density in each metagene object

class metadensity.metadensity.Metagene(esnt, chro, start, end, strand, transcript_type, features=None)#

Bases: object

[A class to hold a gene and its feature]

__init__(esnt, chro, start, end, strand, transcript_type, features=None)#

[Create a Metagene object]

Parameters:

esnt ([string]) – [ID of the gene. store in Metagene.id]
chro ([string]) – [chromosome. store in Metagene.chrom]
start ([int]) – [start of the gene/transcript. self.start]
end ([int]) – [end of the gene/transcript. self.stop]
strand ([string]) – [has to be ‘+’ or ‘-‘]
transcript_type ([string]) – [type of transcript]
features ([list], optional) – [description]. Defaults to None.

self.featnames#

[for things to appear in density array, the name of the feature needs to be this list.]

Type:: [list]

self.coverage#

[raw coverage values from bigwig]

Type:: [dict]

self.sites#

[raw truncation values from bam]

Type:: [dict]

self.value#

[background removed, normalized value]

Type:: [dict]

self.densities#

Type:: [dict]

self.truncations#

Type:: [dict]

concat_density(uID, rep, truncation=False)#: since some feature would have left/right, we need a special function to return concat density

concat_multi_feature(value, feature='exon')#

[concatentae multiple feature in order, for example, multiple exon/CDS, from 5’ to 3’]

Parameters:

value ([np.array]) – [the original value to slice from]
feature (str, optional) – [type of feature, ‘exon’ or ‘cds’]. Defaults to ‘exon’.

Returns:

[concatenate density of multiple feature]

Return type:

[np.array]

coverage_value(eCLIP, rep, interval)#

[return raw RPM value from bigwig files for each position in feature from 5’ to 3’ end]

Parameters:

eCLIP ([eCLIP]) – [eCLIP object]
rep ([str]) – [‘rep1’]
interval ([tuple]) – [(int, int), absolute genome position of interval]

Returns:

[values from bigwig, 5 prime to 3 prime]

Return type:

[np.array]

create_downstream_feature(query_interval, feature_type='intron', feature_name='terminal_exon', length=100)#

[create closest, 3’ feature of the query interval for example, intron list: (100,200), (400,500), (700,900), query exon (500,700), return (700,900) if ‘+’ else (400,500)]

Parameters:

query_interval ([tuple]) – [(int,int), the interval if interest]
feature_type (str, optional) – [‘exon’,’intron’]. Defaults to ‘intron’.
feature_name (str, optional) – [name of the query feature]. Defaults to ‘terminal_exon’.

Returns:

[nothing]

Return type:

[None]

create_feature(interval, feature_name, length=50)#

[create new feature in metagene]

Parameters:

interval ([int or tuple]) – [if int, create point feature at position x; if tuple of (int, int), create interval.Absolute value in genome coordinate]
feature_name ([str]) – [name of the feature]

Example

branchpoint for this gene is 500. The gene is from (100, 300). create_feature(interval=500, feature_name = 'branchpoint')

a snoRNA is nested in this gene from (130, 150). create_feature(interval(130,150), feature_name = 'snoRNA')

create_upstream_feature(query_interval, feature_type='intron', feature_name='terminal_exon', length=100)#

[create closest, 3’ feature of the query interval for example, intron list: (100,200), (400,500), (700,900), query exon (500,700), return (700,900) if ‘+’ else (400,500)]

Parameters:

query_interval ([type]) – [description]
feature_type (str, optional) – [description]. Defaults to ‘intron’.
feature_name (str, optional) – [description]. Defaults to ‘terminal_exon’.

Returns:

[description]

Return type:

[type]

classmethod from_dict(coord_dict)#

[class method, generate metagene object from precomputed coordinates]

Parameters:: coord_dict ([type]) – [description]
Returns:: [description]
Return type:: [type]

get_average_feature(eCLIP, background_method='subtract', normalize=True, truncate=True, **kwargs)#

[fetching values for each feature, also averaging them if needed write values in self.truncate or self.densities, depending on truncate]

Parameters:

eCLIP ([type]) – [description]
background_method (str, optional) – [description]. Defaults to ‘subtract’.
normalize (bool, optional) – [description]. Defaults to True.
truncate (bool, optional) – [description]. Defaults to True.

get_raw_value(eCLIP, truncate=False, **kwargs)#

[fetch raw coverage/truncation value for whole gene write in self.coverage or self.sites]

Parameters:

eCLIP ([eCLIP]) – [eCLIP value]
truncate (bool, optional) – [use truncation or not. if not, fetch from bigwig]. Defaults to False.

Returns:

[description]

Return type:

[type]

get_value(eCLIP, background_method='subtract', normalize=True, truncate=True, **kwargs)#

[perform background removal, normalization and store in self.value[eCLIP.uID][rep_key]]

Parameters:

eCLIP ([type]) – [description]
background_method (str, optional) – [description]. Defaults to ‘subtract’.
normalize (bool, optional) – [description]. Defaults to True.
truncate (bool, optional) – [description]. Defaults to True.

multi_feature_avg(intervals, values, align='left', max_len=None)#

[average and align (zero padding) multiple intron/exon feature: list of intervals for multiple exons/cds/intron; ex: [(1,100), (2,950)] values: 5’ to 3’ processed value (remove background, normalize) of the whole gene align: align feature of different length to the left or right return: average feature (np.array)]

Parameters:

intervals ([list]) – [list of tuples, each tuple is a interval]
values ([np.array]) – [values of window from]
align (str, optional) – [‘left’ or ‘right’, 5 prime=left]. Defaults to ‘left’.
max_len ([int], optional) – [description]. Defaults to None.

Returns:

[description]

Return type:

[type]

order_multi_feature(feature='exon')#

[order multifeature from five prime to three prime end]

Parameters:: feature (str, optional) – [type of feature to order. ‘exon’ or ‘cds’]. Defaults to ‘exon’.
Returns:: [a list of features, 5 prime to 3 prime]
Return type:: [list]

remove_background(eCLIP, rep, method='subtract', truncate=False)#

[remove background by method method = ‘subtract’: subtract IP from Input method = ‘subtract normal’: subtract IP distribution from Input distribution method = ‘relative information’: take relative entropy method = ‘fold change’: log2(IP/IN) method = ‘get null’: get input density]

Parameters:

eCLIP ([eCLIP]) – [description]
rep ([str]) – [description]
method (str, optional) – [description]. Defaults to ‘subtract’.
truncate (bool, optional) – [description]. Defaults to False.

Returns:

[description]

Return type:

[type]

to_relative_axis(interval)#

[convert absolute genome position to relative axis, 0=five prime end of transcript/gene. Ex: self.start = 1000, self.stop = 5000, self.strand = -, interval = (1000,1005) return (5000-1005, 5000-1000) = (3995, 4000) if strand +, (1000-1000, 1005-1000)]

Parameters:: interval ([tuple]) – [tuple of (int, int), absolute genome position]
Returns:: [relative position of that interval, tuple of (int, int)]
Return type:: [tuple]

truncation_count(eCLIP, rep, interval, **kwargs)#

[fetch 5 prime end of read from the bam file. return a list of read start count for each position in feature ex: sites return [10,11,12,13]; feature = (10,15); return [1,1,1,1,0] count from 5’ to 3’]

Parameters:

eCLIP ([eCLIP]) – [eCLIP object]
rep ([str]) – [‘rep1’]
interval ([tuple]) – [(int, int), absolute genome position of things]

Returns:

[read start for each position of designated interval, 5 prime to 3 prime]

Return type:

[np.array]

window_feature(start, end, values)#: [slice from self.value] start: int, relative position of window end: int, relative end position sometimes there are window outside of gene length (from point feature)

class metadensity.metadensity.Metatruncate(eCLIP, name, sample_no=200, background_method='subtract', normalize=True, metagenes=None, transcripts=None, transcript_ids=None, deep_dish_path=None, **kwargs)#

Bases: Meta

[Metagene for 5’ read start]

Parameters:: Meta ([cls]) – [inherit from superclass Meta]

__init__(eCLIP, name, sample_no=200, background_method='subtract', normalize=True, metagenes=None, transcripts=None, transcript_ids=None, deep_dish_path=None, **kwargs)#

[Create a Metatruncate class (using 5 prime read truncation)]

Parameters:

eCLIP ([eCLIP]) – [eCLIP object containing the .bam .bw]
name ([str]) – [The name of this object. Will be used for plotting]
sample_no (int, optional) – [Number of transcripts to calculate. If is shorter than the transcripts specifies, then it will use the first n]. Defaults to 200.
background_method (str, optional) – [‘subtract’; ‘subtract normal’; ‘relative information’]. Defaults to ‘subtract’.
normalize (bool, optional) – [True to use % of edits in position; False to use raw background subtract signals]. Defaults to True.
metagenes ([dictionary of metagenes], optional) – [Use pre-built metagene. Will override sample_no, transcripts and transcript_ids]. Defaults to None.
transcripts ([BedTool], optional) – [Use BedTool containing transcripts]. Defaults to None.
transcript_ids ([list], optional) – [list of transcript or gene ids. the ids needs to be in the pre-built coodinate]. Defaults to None.
deep_dish_path ([str], optional) – [path to precomputed density array deepdish.h5. If specified, directly load from path and override all options]. Defaults to None.

get_truncation(background_method='subtract', normalize=True, **kwargs)#

[calculate metadensity for each metagene. store density in each metagene object]

Parameters:

background_method (str, optional) – [‘subtract’; ‘subtract normal’; ‘relative information’]. Defaults to ‘subtract’.
normalize (bool, optional) – [ True to use % of edits in position; False to use raw background subtract signals]. Defaults to True.

class metadensity.metadensity.Protein_coding_gene(esnt, chro, start, end, strand, transcript_type, features=None)#

Bases: Metagene

__init__(esnt, chro, start, end, strand, transcript_type, features=None)#

[Create a Metagene object]

Parameters:

esnt ([string]) – [ID of the gene. store in Metagene.id]
chro ([string]) – [chromosome. store in Metagene.chrom]
start ([int]) – [start of the gene/transcript. self.start]
end ([int]) – [end of the gene/transcript. self.stop]
strand ([string]) – [has to be ‘+’ or ‘-‘]
transcript_type ([string]) – [type of transcript]
features ([list], optional) – [description]. Defaults to None.

self.featnames#

[for things to appear in density array, the name of the feature needs to be this list.]

Type:: [list]

self.coverage#

[raw coverage values from bigwig]

Type:: [dict]

self.sites#

[raw truncation values from bam]

Type:: [dict]

self.value#

[background removed, normalized value]

Type:: [dict]

self.densities#

Type:: [dict]

self.truncations#

Type:: [dict]

build_full_cds(eCLIP, truncate=True)#

class metadensity.metadensity.RNU(esnt, chro, start, end, strand, transcript_type, features=None)#

Bases: Metagene

__init__(esnt, chro, start, end, strand, transcript_type, features=None)#

[Create a Metagene object]

Parameters:

esnt ([string]) – [ID of the gene. store in Metagene.id]
chro ([string]) – [chromosome. store in Metagene.chrom]
start ([int]) – [start of the gene/transcript. self.start]
end ([int]) – [end of the gene/transcript. self.stop]
strand ([string]) – [has to be ‘+’ or ‘-‘]
transcript_type ([string]) – [type of transcript]
features ([list], optional) – [description]. Defaults to None.

self.featnames#

[for things to appear in density array, the name of the feature needs to be this list.]

Type:: [list]

self.coverage#

[raw coverage values from bigwig]

Type:: [dict]

self.sites#

[raw truncation values from bam]

Type:: [dict]

self.value#

[background removed, normalized value]

Type:: [dict]

self.densities#

Type:: [dict]

self.truncations#

Type:: [dict]

class metadensity.metadensity.STAMP#

Bases: object

STAMP object to contain edit sites

__init__()#

class metadensity.metadensity.Window(start, end)#

Bases: object

__init__(start, end)#

class metadensity.metadensity.YRNA(esnt, chro, start, end, strand, transcript_type, features=None)#

Bases: Metagene

__init__(esnt, chro, start, end, strand, transcript_type, features=None)#

[Create a Metagene object]

Parameters:

esnt ([string]) – [ID of the gene. store in Metagene.id]
chro ([string]) – [chromosome. store in Metagene.chrom]
start ([int]) – [start of the gene/transcript. self.start]
end ([int]) – [end of the gene/transcript. self.stop]
strand ([string]) – [has to be ‘+’ or ‘-‘]
transcript_type ([string]) – [type of transcript]
features ([list], optional) – [description]. Defaults to None.

self.featnames#

[for things to appear in density array, the name of the feature needs to be this list.]

Type:: [list]

self.coverage#

[raw coverage values from bigwig]

Type:: [dict]

self.sites#

[raw truncation values from bam]

Type:: [dict]

self.value#

[background removed, normalized value]

Type:: [dict]

self.densities#

Type:: [dict]

self.truncations#

Type:: [dict]

class metadensity.metadensity.circRNA(esnt, chro, start, end, strand, transcript_type, features=None)#

Bases: Metagene

__init__(esnt, chro, start, end, strand, transcript_type, features=None)#

[Create a Metagene object]

Parameters:

esnt ([string]) – [ID of the gene. store in Metagene.id]
chro ([string]) – [chromosome. store in Metagene.chrom]
start ([int]) – [start of the gene/transcript. self.start]
end ([int]) – [end of the gene/transcript. self.stop]
strand ([string]) – [has to be ‘+’ or ‘-‘]
transcript_type ([string]) – [type of transcript]
features ([list], optional) – [description]. Defaults to None.

self.featnames#

[for things to appear in density array, the name of the feature needs to be this list.]

Type:: [list]

self.coverage#

[raw coverage values from bigwig]

Type:: [dict]

self.sites#

[raw truncation values from bam]

Type:: [dict]

self.value#

[background removed, normalized value]

Type:: [dict]

self.densities#

Type:: [dict]

self.truncations#

Type:: [dict]

metadensity.metadensity.density_array_entropy(f, fn)#: relative entropy for each position, f = positive binding density; fn = negative binding examples’s density

class metadensity.metadensity.eCLIP(name='RBP', uID=None, single_end=True, rep_keys=['rep1', 'rep2'], read_densities={}, peaks={}, idr=None, read2=True)#

Bases: object

eCLIP object to cotain .bigwig, .bam, .bed

__init__(name='RBP', uID=None, single_end=True, rep_keys=['rep1', 'rep2'], read_densities={}, peaks={}, idr=None, read2=True)#

calculate_coverage(coord=None)#

[Calculate coverage for all bams for intervals in coord] :param coord: [The intervals to count number of reads. For example, if set to transcript, :type coord: [BedTool], optional

Returns:: coveragedf

enough_coverage_transcript(coord=None, thres=0.001, key='rep1', attr_to_return='ID')#

[Filter for transcript id that has enough coverage in IP library]

Parameters:

coord ([BedTool], optional) – [The intervals to count number of reads. For example, if set to transcript,
transcript. (will return transcript id with at least "thres" reads. Has to be in gff3. Check if interval.attrs['ID'] exist. ]. Defaults to) –
thres (int, optional) – [the number of reads]. Defaults to 200.
key (str, optional) – [library to look at. Set to any keys in self.read_densities]. Defaults to rep1.

Returns:

[list of interval ids]

Return type:

[list]

enough_transcripts(coord=None, thres=200, key='rep1', attr_to_return='ID')#

[Filter for transcript id that has enough reads in IP]

Parameters:

coord ([BedTool], optional) – [The intervals to count number of reads. For example, if set to transcript,
transcript. (will return transcript id with at least "thres" reads. Has to be in gff3. Check if interval.attrs['ID'] exist. ]. Defaults to) –
thres (int, optional) – [the number of reads]. Defaults to 200.
key (str, optional) – [library to look at. Set to any keys in self.read_densities]. Defaults to rep1.

Returns:

[list of interval ids]

Return type:

[list]

find_idr_transcript(genome_coord=None)#

[find transcript containing at least 1 IDR peak]

Kwargs:: genome_coord: (BedTool) transcript coordintae. default are canonical transcripts.

classmethod from_series(series, single_end=True, read2=True)#

[build eCLIP object from series containing absolute path for .bam, .bw, .beds]

Parameters:

series ([pd.Series]) – keys include “RBP” for naming; “uid” for unique ID, has to be unique else will cross-contaminate other objects.
"bam_0" (or "bam_control_0", "bam_control_1") –
"bam_1" (or "bam_control_0", "bam_control_1") –
"bam_control" (or "bam_control_0", "bam_control_1") –
"plus_0" –
"plus_1" –
"minus_0" ("plus_control" for bigwigs filenames for the plus strand;) –
"minus_1" –
strand; ("minus_control" for bigwig filenames to the negtaive) –
"bed_0" –
peaks ("bed_1" for individual replicate) –
replicates. ("idr" for consensus peaks for multiple) –
technology (If your) –
strand-specific (is not) –
file. (set both strand's bigwig filenames to the same) –
_0 –
1 (_1 is for replicate) –
have. (2; you can extend to the number of reps you) –
control (If your technology has no Input or any sort of background) –
SMInput (If only 1) –
bam_control_0 (use) –
bam_control_1...etc. –
SMInput –
bam_control (simply omit _0. use) –
plue_control...etc –
control –
around. (set the controls to a random bam file and use "background_method=None" in Metadensity to hack) –
single_end (bool, optional) – [if your CLIP is done in single end set to True else Metatruncate will have problem]. Defaults to True.
read2 (bool, optional) – [Use read 2 only if sequencing is paired-end.]. Defaults to True.

Returns:

[a object that holds eCLIP data]

Return type:

[eCLIP]

class metadensity.metadensity.lncRNA(esnt, chro, start, end, strand, transcript_type, features=None)#

Bases: Metagene

__init__(esnt, chro, start, end, strand, transcript_type, features=None)#

[Create a Metagene object]

Parameters:

esnt ([string]) – [ID of the gene. store in Metagene.id]
chro ([string]) – [chromosome. store in Metagene.chrom]
start ([int]) – [start of the gene/transcript. self.start]
end ([int]) – [end of the gene/transcript. self.stop]
strand ([string]) – [has to be ‘+’ or ‘-‘]
transcript_type ([string]) – [type of transcript]
features ([list], optional) – [description]. Defaults to None.

self.featnames#

[for things to appear in density array, the name of the feature needs to be this list.]

Type:: [list]

self.coverage#

[raw coverage values from bigwig]

Type:: [dict]

self.sites#

[raw truncation values from bam]

Type:: [dict]

self.value#

[background removed, normalized value]

Type:: [dict]

self.densities#

Type:: [dict]

self.truncations#

Type:: [dict]

class metadensity.metadensity.miRNA(esnt, chro, start, end, strand, transcript_type, features=None)#

Bases: Metagene

a class for miRNA’s features specifically

__init__(esnt, chro, start, end, strand, transcript_type, features=None)#

[Create a Metagene object]

Parameters:

esnt ([string]) – [ID of the gene. store in Metagene.id]
chro ([string]) – [chromosome. store in Metagene.chrom]
start ([int]) – [start of the gene/transcript. self.start]
end ([int]) – [end of the gene/transcript. self.stop]
strand ([string]) – [has to be ‘+’ or ‘-‘]
transcript_type ([string]) – [type of transcript]
features ([list], optional) – [description]. Defaults to None.

self.featnames#

[for things to appear in density array, the name of the feature needs to be this list.]

Type:: [list]

self.coverage#

[raw coverage values from bigwig]

Type:: [dict]

self.sites#

[raw truncation values from bam]

Type:: [dict]

self.value#

[background removed, normalized value]

Type:: [dict]

self.densities#

Type:: [dict]

self.truncations#

Type:: [dict]

find_hairpin()#

metadensity.metadensity.nan_array()#

class metadensity.metadensity.rRNA(esnt, chro, start, end, strand, transcript_type, features=None)#

Bases: Metagene

__init__(esnt, chro, start, end, strand, transcript_type, features=None)#

[Create a Metagene object]

Parameters:

esnt ([string]) – [ID of the gene. store in Metagene.id]
chro ([string]) – [chromosome. store in Metagene.chrom]
start ([int]) – [start of the gene/transcript. self.start]
end ([int]) – [end of the gene/transcript. self.stop]
strand ([string]) – [has to be ‘+’ or ‘-‘]
transcript_type ([string]) – [type of transcript]
features ([list], optional) – [description]. Defaults to None.

self.featnames#

[for things to appear in density array, the name of the feature needs to be this list.]

Type:: [list]

self.coverage#

[raw coverage values from bigwig]

Type:: [dict]

self.sites#

[raw truncation values from bam]

Type:: [dict]

self.value#

[background removed, normalized value]

Type:: [dict]

self.densities#

Type:: [dict]

self.truncations#

Type:: [dict]

class metadensity.metadensity.snoRNA(esnt, chro, start, end, strand, transcript_type, features=None)#

Bases: Metagene

__init__(esnt, chro, start, end, strand, transcript_type, features=None)#

[Create a Metagene object]

Parameters:

esnt ([string]) – [ID of the gene. store in Metagene.id]
chro ([string]) – [chromosome. store in Metagene.chrom]
start ([int]) – [start of the gene/transcript. self.start]
end ([int]) – [end of the gene/transcript. self.stop]
strand ([string]) – [has to be ‘+’ or ‘-‘]
transcript_type ([string]) – [type of transcript]
features ([list], optional) – [description]. Defaults to None.

self.featnames#

[for things to appear in density array, the name of the feature needs to be this list.]

Type:: [list]

self.coverage#

[raw coverage values from bigwig]

Type:: [dict]

self.sites#

[raw truncation values from bam]

Type:: [dict]

self.value#

[background removed, normalized value]

Type:: [dict]

self.densities#

Type:: [dict]

self.truncations#

Type:: [dict]

class metadensity.metadensity.tRNA(esnt, chro, start, end, strand, transcript_type, features=None)#

Bases: Metagene

__init__(esnt, chro, start, end, strand, transcript_type, features=None)#

[Create a Metagene object]

Parameters:

esnt ([string]) – [ID of the gene. store in Metagene.id]
chro ([string]) – [chromosome. store in Metagene.chrom]
start ([int]) – [start of the gene/transcript. self.start]
end ([int]) – [end of the gene/transcript. self.stop]
strand ([string]) – [has to be ‘+’ or ‘-‘]
transcript_type ([string]) – [type of transcript]
features ([list], optional) – [description]. Defaults to None.

self.featnames#

[for things to appear in density array, the name of the feature needs to be this list.]

Type:: [list]

self.coverage#

[raw coverage values from bigwig]

Type:: [dict]

self.sites#

[raw truncation values from bam]

Type:: [dict]

self.value#

[background removed, normalized value]

Type:: [dict]

self.densities#

Type:: [dict]

self.truncations#

Type:: [dict]

metadensity.metadensity.trim_or_pad(density, target_length, align='left', pad_value=0)#: make density your target length by trimming or padding

class metadensity.metadensity.vaultRNA(esnt, chro, start, end, strand, transcript_type, features=None)#

Bases: Metagene

__init__(esnt, chro, start, end, strand, transcript_type, features=None)#

[Create a Metagene object]

Parameters:

esnt ([string]) – [ID of the gene. store in Metagene.id]
chro ([string]) – [chromosome. store in Metagene.chrom]
start ([int]) – [start of the gene/transcript. self.start]
end ([int]) – [end of the gene/transcript. self.stop]
strand ([string]) – [has to be ‘+’ or ‘-‘]
transcript_type ([string]) – [type of transcript]
features ([list], optional) – [description]. Defaults to None.

self.featnames#

[for things to appear in density array, the name of the feature needs to be this list.]

Type:: [list]

self.coverage#

[raw coverage values from bigwig]

Type:: [dict]

self.sites#

[raw truncation values from bam]

Type:: [dict]

self.value#

[background removed, normalized value]

Type:: [dict]

self.densities#

Type:: [dict]

self.truncations#

Type:: [dict]

metadensity.plotd module#

Created on June, 2020 @author:Hsuan-lin Her Scripts to visualize meta-density

metadensity.plotd.auto_rbp_color(rbp_list)#

metadensity.plotd.beautify(fig, offset=10, trim=True, left=True)#: wrapper around sns.despine to make ugly python figures better-looking

metadensity.plotd.calcaulte_grid_width(features_to_show, ax_width_dict)#: calculate the number of grid needed for gridspec

metadensity.plotd.generate_axis(nrows=2, ax_width_dict={'CDS': 2, 'UTR': 1, 'branchpoint': 2, 'duplex': 2, 'exon': 2, 'hairpin': 2, 'intron': 3, 'mature': 2}, color_bar=False, feat_len_dict={'3p_duplex': 20, '5p_duplex': 20, 'CDS': 100, 'branchpoint': 50, 'branchpoint_pred': 50, 'exon': 100, 'first_CDS': 100, 'first_exon': 100, 'five_prime_UTR': 100, 'full_CDS': 200, 'hairpin': 10, 'intron': 500, 'last_CDS': 100, 'last_exon': 100, 'mature': 20, 'pri_mir': 500, 'start_codon': 3, 'stop_codon': 3, 'three_prime_UTR': 150}, features_to_show=['exon', 'intron'], anno=None, height=None)#: generate matplotlib ax for feature plots

metadensity.plotd.get_cmap(n, name='hsv')#: Returns a function that maps each index in 0, 1, …, n-1 to a distinct RGB color; the keyword argument name must be a standard mpl colormap name.

metadensity.plotd.get_xticklabels(feat, align, n_tick=5)#: create xticklabels for each kind of features

metadensity.plotd.make_concat_xtick(all_meta, features_to_show=['exon', 'intron'])#

metadensity.plotd.plot_enrich(metas, ymax=0.5, alpha=0.3, features_to_show=['exon', 'intron'], smooth=False, color_dict=None, mask=False, scatter=False, ymin=0, sigma=5)#: get a bunch of eCLIPs, plot their mean density

metadensity.plotd.plot_mean_density(metas, ymax=0.001, alpha=0.3, plot_std=True, stat='mean', features_to_show=['exon', 'intron'], smooth=False, color_dict=None, mask=False, ylabel=None, sigma=5, rep_handle='concat')#: get a bunch of eCLIPs, plot their mean density

metadensity.plotd.plot_mean_density_concat(metas, ymax=0.001, alpha=0.3, plot_std=True, stat='mean', features_to_show=['exon', 'intron'], smooth=False, color_dict=None, mask=False, ylabel=None, sigma=5, figsize=(7, 3))#: get a bunch of eCLIPs, plot their mean density

metadensity.plotd.plot_rbp_map(metas, alpha=0.6, ymax=0.001, features_to_show=['exon', 'intron'], sort=False, rep_handle='mean', cmap='Greys')#: get a bunch of Metadensity or Metatruncation Object, plot their individual density in a heatmap metas: list of Metadensity or Metatruncate object alpha: transparency in plt.plot() ymax: the max value in plt.set_ylim() features_to_show: list of genomic/transcriptomic features to show. options include all feature names. You can also use pre-set combinations such as generic_rna, protein_coding etc. Use metadensity.density_array to see what is available sort: whether to sort RNAs (rows in RBPmap) “lexicographically”. setting true will make RNA with similar binding pattern cluster together on a map rep_handle: ‘mean’ to plot the mean of 2 reps. ‘concat’ to show 2 reps individually. specify rep keys like ‘rep1’, ‘rep2’ to show only that rep. cmap: color map to use

metadensity.plotd.rep_merge_options(den_arr, feat, align, metaden_object, rep_handle)#: return the appropriate array depending how user wants to merge the replicate rep_handle: ‘mean’ to plot the mean of 2 reps. ‘concat’ to show 2 reps individually. specify rep keys like ‘rep1’, ‘rep2’ to show only that rep.

metadensity.pos_enrich module#

metadensity.pos_enrich.KS_enrich(meta_null, meta_ip, pval_thres=0.05, n_largest_to_remove=5, sigma=None, bidir=False)#: main function to calculate enrichment based on AUC?

class metadensity.pos_enrich.PosEnrichResult(pval_path, stat_path, name, pval_thres=0.05, test='KS')#

Bases: object

Positional Specific Enrichment Result

__init__(pval_path, stat_path, name, pval_thres=0.05, test='KS')#

metadensity.pos_enrich.Wilcox_enrich(meta_null, meta_ip, pval_thres=0.05, n_largest_to_remove=5, sigma=None)#: main function to calculate enrichment based on AUC?

metadensity.pos_enrich.auc_cum_dist(real, null, n_largest_to_remove=5, bidir=False)#: return AUC for a feature throughout all positions

metadensity.pos_enrich.auc_cum_wilcox(real, null, n_largest_to_remove=5)#: return AUC for a feature throughout all positions

metadensity.pos_enrich.construct_distribution(e, metagenes, use_truncate=True)#: Given eCLIP object, construct null and IP density

metadensity.pos_enrich.enrich_by_thres(meta_null, meta_ip, sigma=5, percentile=99)#: main function for percentile threshold enrichment

metadensity.pos_enrich.mask_by_null(real, thres)#: mask the density if not rejected

metadensity.pos_enrich.null_thres(null, perc=99)#: return threshold of each position that need to be rejected

metadensity.readdensity module#

Created on May 3, 2016 Module that helps containerize the CLIP density information. @author: Gabe, Brian

class metadensity.readdensity.Density#

Bases: object

values(chrom, start, end, strand)#

class metadensity.readdensity.Phastcon(phastcon, name=None)#

Bases: Density

__init__(phastcon, name=None)#

values(chrom, start, end, strand)#

Parameters:

chrom (basestring) – (eg. chr1)
start (int) – 0-based start (first position in chromosome is 0)
end (int) – 1-based end (last position is not included)
strand (str) – either ‘+’ or ‘-’

Returns:

densites – values corresponding to density over specified positions.

Return type:

list

class metadensity.readdensity.ReadDensity(pos, neg, name=None, bam=None)#

Bases: Density

ReadDensity class .. attribute:: self.pos

type:

positive *.bw file

self.neg#

Type:: negative *.bw file

__init__(pos, neg, name=None, bam=None)#

pseudocount()#: Returns the minimum normalized pseudocount of 1 read. :returns: rpm :rtype: float

rpm_to_r(rpm)#

Returns the raw read representation given a pseudocount :param rpm: rpm-normalized read density :type rpm: float

Returns:: r
Return type:: float

total_mapped()#: Returns the number of mapped reads :rtype: mapped

values(chrom, start, end, strand)#

Parameters:

chrom (basestring) – (eg. chr1)
start (int) – 0-based start (first position in chromosome is 0)
end (int) – 1-based end (last position is not included)
strand (str) – either ‘+’ or ‘-’

Returns:

densites – values corresponding to density over specified positions.

Return type:

list

metadensity.region_call module#

Created on Jul 28, 2020 @author: Hsuan-lin Her

metadensity.region_call.main(ip_bamfile, input_bamfile, features, n_upper=None, n_pool=8, timeout=1000, pval=0.001, fold_change=3, save_pickle=False)#: run_poission given eCLIP object

metadensity.region_call.nreads(bam, interval)#: pysam fetch but strand specific, return number reads mapped to region

metadensity.region_call.option_parser()#: return parser :return: OptionParser object

metadensity.region_call.poission_pval(smi_total, smi_region, ip_total, ip_region)#

metadensity.region_call.region_based_calling(interval, ip_bamfile, input_bamfile, ip_total, smi_total, pval_thres=0.05)#: determine whether feature interval i is enriched in IP

metadensity.sequence module#

metadensity.sequence.count_kmer(all_seqs, k=5)#: Count k-mer

metadensity.sequence.enrich_plot(stat, motif, r_thres=2, freq_thres=0.125, ax=None)#: plot R value vs IP count kmer enrichment plot

metadensity.sequence.getRNAsequence(interval)#: fetch RNA sequence from bedtool interval

metadensity.sequence.get_interval_seq(chrom, start, end, strand)#: get sequence around interval

metadensity.sequence.get_truncation_seq(chrom, start, strand, window=10)#: get sequence around window of truncation site

metadensity.sequence.kmer(s, k=5)#: given string, return all UNIQUE 5-mer

metadensity.sequence.kmer_rvalue(ip_seq, input_seq, k=5)#: Calculate Enrichment Value for each k-mer The frequency of kmer in IP / frequency of kmer in Input

metadensity.sequence.kmer_zscore(ip_seqs, mean, std, k=7)#: return z score for each k-mer based on control k-mer distribution

metadensity.sequence.simulate_kmer_background(input_seq, k=7, n_iter=100, n_sample=1000)#: from input sequence, sample a many times, calculate mean and std for each kmer)

metadensity.shape_from_read module#

metadensity.shape_from_read.ks_all_pos(w, w_in, p_thres=0.01)#: run KS test for all position

metadensity.shape_from_read.many_intervals(bam, data, filtered, num=50, window=10, single_end=False)#: for many genes that have shape data, return all the windows

metadensity.shape_from_read.window_around(data, start, end)#: return values from start:end, handles problems like start < 0 or end > data langth

metadensity.shape_from_read.window_around_eclip_sites(interval, data, bam, window=10, single_end=False)#: for a bedtool interval, return windows of shape values

metadensity.shape_from_read.windows_for_all(bam1, bam2, bam_in1, data, filtered, bam_in2=None, window=10, num=50, single_end=False)#

metadensity.shrink_region module#

class metadensity.shrink_region.Cluster(chrom, start, end, strand, ip_site, input_site)#

Bases: object

Read start Cluster

__init__(chrom, start, end, strand, ip_site, input_site)#

enrich_score(total_ip_in_region, total_input_in_region)#: calculate enrichment score by binom

make_fixed_size(size=30)#: make cluster fixed sized

to_bedstr()#: convert to bedtool compatible string format

metadensity.shrink_region.control_cluster(clusters, interval)#: given found cluster, return control cluster in the same interval of similar size

metadensity.shrink_region.find_cluster(interval, bam_fileobj, inputbam_fileobj, combine=True, eps=5, size=30, read2=True, cov_ratio=10)#: find read start cluster using DBSCAN interval: BedTool interval bam_fileobj: pysam IP bam inputbam_fileobj: pysam Input bam combine: if True, create cluster using both IP and Input reads; better for compare to Input. eps, min_samples: DBSCAN param

metadensity.shrink_region.get_cluster_seq(df)#

metadensity.shrink_region.main(ip_bamfile, input_bamfile, enrich_bed=None, n_upper=None, n_pool=8, timeout=1000, eps=2, min_samples=2, combine=True, size=30, tsv=None, use_ri=False, read2=True)#: DB scan for cluster ip_bamfile: input_bamfile: enrich_bed: bed file containing enriched region (produced by region_call.py)

metadensity.shrink_region.region_cluster(interval, bam, inputbam, combine=True, eps=2, cov_ratio=10, size=30, read2=True)#: find cluster within bedtools interval

metadensity.shrink_region.region_cluster_ri(interval, bam, inputbam, pseudocount=0.1, sigma=2, ri_height=0.02, size=30, read2=True)#: use RI to center the cluster instead of DBSCAN

metadensity.truncation module#

metadensity.truncation.fetch_reads(bam_fileobj, interval=None, chrom=None, start=None, end=None, strand=None, single_end=False, read2=True)#: fetch reads by pysam.fetch, but strand specific

metadensity.truncation.get_mismatch(profile)#: return # non-majority nucleotide in read

metadensity.truncation.read_start_sites(bam_fileobj, interval=None, chrom=None, start=None, end=None, strand=None, single_end=False, read2=True, included_diagnostic_events=['trun', 'mismatch', 'del'])#: return crosslinking events in bedtool interval from bamfile object; strand specific

metadensity.truncation.strand_specific_pileup(bam_fileobj, interval=None, chrom=None, start=None, end=None, strand=None, single_end=False, read2=True)#: pileup reads of a defined region, return profile (pd.DataFrame), count of truncation, deletion and nucleotides, index = genomic positions

metadensity.truncation.truncation_relative_axis(bam_fileobj, interval=None, chrom=None, start=None, end=None, strand=None, single_end=False, read2=True)#: return truncation + mismatch + indel count for each position at each site (5’ to 3’) in np.array()

metadensity package#

Submodules#

metadensity.config module#

metadensity.kmer_from_read module#

metadensity.metadensity module#

metadensity.plotd module#

metadensity.pos_enrich module#

metadensity.readdensity module#

metadensity.region_call module#

metadensity.sequence module#

metadensity.shape_from_read module#

metadensity.shrink_region module#

metadensity.truncation module#

Module contents#