This notebooks shows how you build the parsed data from all the files from the config files#
This will be of use when you: 1. are using some genome coordinates not provided in here 2. you need some other features built-in 3. include non-canonical transcripts
Prerequsites: 1. make sure files in config/COORD.ini are correctly specified.
Step 1: choose the coordinates#
[1]:
from pybedtools import BedTool
import pickle
import pandas as pd
import os
import sys
import metadensity as md
from pathlib import Path
if not os.path.isfile('/tscc/nfs/home/hsher/Metadensity/config/hg38_v40.ini'):
raise Exception("Config file does not exist. Create one by following https://github.com/algaebrown/Metadensity")
md.settings.from_config_file('/tscc/nfs/home/hsher/Metadensity/config/hg38_v40.ini')
please set the right config according to genome coordinate
Using /home/hsher/gencode_coords/GRCh38.p13.genome.fa
Using HG38 by default
Using /tscc/nfs/home/hsher/GRCh38.primary_assembly.genome.fa.gz
Step 2: check if basic file exists#
[2]:
# load gencode files, and also downloaded branchpoint files
# coord = BedTool(md.settings.gencode_feature_fname)
# transcripts = BedTool(md.settings.transcript_fname)
gencode = BedTool(md.settings.gencode_fname)
print(md.settings.gencode_fname)
/tscc/nfs/home/hsher/annotations/gencode.v40.primary_assembly.annotation.gff3.gz
Step 3: read branchpoints files#
[3]:
# sequencing branchpoints
if os.path.isfile(md.settings.branchpoint_fname):
branchpoint = BedTool(md.settings.branchpoint_fname)
print(f'found {len(branchpoint)} branchpoints in experiment')
else:
branchpoint = None
print(f'No branchpoint file provided in config file: {md.settings.branchpoint_fname}')
found 59328 branchpoints in experiment
[4]:
# machine learning branchpoints
if os.path.isfile(md.settings.branchpoint_pred_fname):
branchpoint_pred = pd.read_csv(md.settings.branchpoint_pred_fname)
# make to an interval
branchpoint_pred['start'] = branchpoint_pred['start']-1 # 1
branchpoint_pred_bed = BedTool.from_dataframe(branchpoint_pred[['chromosome', 'start', 'end', 'exon_id', 'exon_number', 'strand']])
print(f'found {len(branchpoint_pred_bed)} branchpoints in ML prediction')
else:
branchpoint_pred_bed = None
print(f'No ML branchpoint file provided in config file: {md.settings.branchpoint_pred_fname}')
found 367672 branchpoints in ML prediction
Step 4: read polyA files#
[5]:
# some criteria, see https://polyasite.unibas.ch/atlas
included_polyA_type = ['TE', 'EX', 'IN', 'DS']
[6]:
# make type specific annotation
def extract_polyA_signal_coordinates(subset_polyadf):
''' extracting the coordinate of polyA signal from polya dataframe'''
# extract polya signals
signal_coord = []
for index, row in subset_polyadf.iterrows():
if type(row['polyasignal'])==str:
polyasignals = row['polyasignal'].split(';')
for sig in polyasignals:
motif, rela_pos, obs_pos = sig.split('@')
signal_coord.append([row['chrom'],int(obs_pos),int(obs_pos)+1,
row['name'], row['polyatype'], row['strand']])
# make into bed
polysignal_df = pd.DataFrame(signal_coord,
columns= ['chrom', 'start', 'end', 'name', 'score', 'strand'])
polyasignal_bed = BedTool.from_dataframe(polysignal_df)
return polyasignal_bed
def polyAtype_specific_coords(polyAtype, polyadf):
'''
create polyA related feature for specific types of polyA
polyAtype: can be TE, EX, IN, DS.. see https://polyasite.unibas.ch/atlas
polyadf: pd.DataFrame from the tsv file
return:
poly_site_bed: BedTool object with polyA site annotation
polya_signal_bed: BedTool object with polyA signals
'''
# filter for specific types
subset_polyadf = polyadf.loc[polyadf['polyatype']==polyAtype]
# create bed of polyA sites
polya_site_bed = BedTool.from_dataframe(subset_polyadf)
polya_signal_bed = extract_polyA_signal_coordinates(subset_polyadf)
return polya_site_bed, polya_signal_bed
[7]:
if os.path.isfile(md.settings.polya_fname):
polyadf = pd.read_csv(os.path.join(md.settings.polya_fname),
sep = '\t',
header = None,
names = ['chrom', 'start', 'end', 'name', 'average expression', 'strand', 'perc_sample','n_sample', 'avg_tpm', 'polyatype', 'polyasignal'])
polyadf['chrom'] = 'chr'+polyadf['chrom'].astype(str)
# # filter for specific types
# polydf = polyadf.loc[polyadf['polyatype'].isin(included_polyA_type)]
# # make into bed
# polya = BedTool.from_dataframe(polyadf)
polya, polyasignal_bed = polyAtype_specific_coords('TE', polyadf = polyadf)
print(f'found {len(polya)} polyA sites, {len(polyasignal_bed)} polyA signals')
else:
polya = None
polyasignal_bed = None
print(f'no polyA annotation from file, {md.settings.polya_fname}')
no polyA annotation from file,
What Metadensity need is a dictionary pickled, saved in the data/ folder.#
Like in the build_transcript_dict function.
The dictionary contains several levels - First level: ensembl ID -> dictionary - Second level: (1 gene/1 transcript) -> information including chrom, start, end, strand and feature - Third level: feature dictionary {‘feature names’:-> set() of (start, end) tuples}
To add custom features names, you will need to modify the coorisponding Metagene child class. For example, you want to add polyA_site sites to a mRNA, then you need to add polyA_site to the self.featnames of the class Protein_coding_gene(Metagene).
Step 5: organize all features into a gene/transcript-based dictionary#
[8]:
# these functions may need to be modified depending on what file format your are inputting
def add_feature(all_dict, feature_bed, coords_with_id, name):
''' update external feature into transcript/gene centric annotations'''
overlap_transcript = feature_bed.intersect(coords_with_id, wb = True, s = True).saveas()
for feat in overlap_transcript:
transcript_id = feat[-1].split('transcript_id=')[1].split(';')[0]
gene_id = feat[-1].split('gene_id=')[1].split(';')[0]
for id_ in [transcript_id, gene_id]:
if name not in all_dict[id_]['features'].keys():
all_dict[id_]['features'][name] = set()
all_dict[id_]['features'][name].update([feat.start]) # POINT FEATURE ONLY!
def first_last_exon_cds(all_dict):
''' Designate the most 5' CDS/exon as first_CDS/exon, the most 3' CDS/exon as last_CDS/exon
'''
for key in all_dict.keys():
features = all_dict[key]['features']
#### exon
if 'exon' in features.keys():
min_start = min([e[0] for e in list(features['exon'])])
max_start = max([e[1] for e in list(features['exon'])])
if all_dict[key]['strand'] == '+':
features['first_exon'] = set([e for e in list(features['exon']) if e[0] == min_start])
features['last_exon'] = set([e for e in list(features['exon']) if e[1] == max_start])
else:
features['last_exon'] = set([e for e in list(features['exon']) if e[0] == min_start])
features['first_exon'] = set([e for e in list(features['exon']) if e[1] == max_start])
features['exon'] = features['exon'] - features['first_exon'] - features['last_exon']
else:
#del all_dict[key] # no exon, no need to keep
pass
## CDS
if 'CDS' in features.keys():
min_start = min([e[0] for e in list(features['CDS'])])
max_start = max([e[1] for e in list(features['CDS'])])
if all_dict[key]['strand'] == '+':
features['first_CDS'] = set([e for e in list(features['CDS']) if e[0] == min_start])
features['last_CDS'] = set([e for e in list(features['CDS']) if e[1] == max_start])
else:
features['last_CDS'] = set([e for e in list(features['CDS']) if e[0] == min_start])
features['first_CDS'] = set([e for e in list(features['CDS']) if e[1] == max_start])
features['CDS'] = features['CDS'] - features['first_CDS'] - features['last_CDS']
def five_three_utr(all_dict):
''' hg19 annotations don't have UTR. Designate 5 prime and 3 prime UTR'''
for key in all_dict.keys():
features = all_dict[key]['features']
#### exon
if 'UTR' in features.keys():
min_start = min([e[0] for e in list(features['UTR'])])
max_start = max([e[1] for e in list(features['UTR'])])
if all_dict[key]['strand'] == '+':
features['five_prime_UTR'] = set([e for e in list(features['UTR']) if e[0] == min_start])
features['three_prime_UTR'] = set([e for e in list(features['UTR']) if e[1] == max_start])
else:
features['five_prime_UTR'] = set([e for e in list(features['UTR']) if e[1] == max_start])
features['three_prime_UTR'] = set([e for e in list(features['UTR']) if e[0] == min_start])
del features['UTR']
def build_transcript_dict(gencode = gencode, outdir = md.settings.datadir,
branchpoint = branchpoint, branchpoint_pred_bed = branchpoint_pred_bed,
polya = polya, polyasignal_bed = polyasignal_bed):
''' extract gencode coordinate and save in data
gencode: BedTool, gencode.v33.annotations.gff3
outdir: md.settings.datadir
branchpoint: BedTool
branchpoint_pred: BedTool
polya: BedTool
polyasignal_bed: BedTool
'''
# make a directory for every coordinate
annotation_path = outdir
if not Path(annotation_path).is_file():
Path(annotation_path).mkdir(parents=True, exist_ok = True)
all_dict = {}
# filter for transcripts
transcript = gencode.filter(lambda x: x[2] == 'transcript').saveas()
exons = gencode.filter(lambda x: x[2] == 'exon').saveas()
intron = transcript.subtract(exons, s = True).saveas()
print('extracting from gencode annotations')
for g in gencode:
feature_type = g[2]
if feature_type == 'transcript' or feature_type == 'gene':
# start a new dict
all_dict[g.attrs['ID']] = {} # ENST or ENSG
all_dict[g.attrs['ID']]['chrom'] = g.chrom
all_dict[g.attrs['ID']]['start'] = g.start
all_dict[g.attrs['ID']]['end'] = g.end
all_dict[g.attrs['ID']]['strand'] = g.strand
all_dict[g.attrs['ID']]['id'] = g.attrs['ID']
all_dict[g.attrs['ID']]['type'] = g.attrs['gene_type']
all_dict[g.attrs['ID']]['name'] = g.attrs['gene_name']
all_dict[g.attrs['ID']]['features'] = {} # start to contain stuffs
else:
transcript_id = g.attrs['transcript_id'] # doesn't always equal to ID, for X,Y chromosome genes
gene_id = g.attrs['gene_id']
for ids in [transcript_id, gene_id]:
target_dict = all_dict[ids]
if feature_type not in target_dict['features'].keys():
target_dict['features'][feature_type] = set()
target_dict['features'][feature_type].update([(g.start, g.end)])
print('building intron')
for i in intron:
feature_type = 'intron'
transcript_id = i.attrs['transcript_id']
gene_id = i.attrs['gene_id']
for ids in [transcript_id, gene_id]:
target_dict = all_dict[ids]
if feature_type not in target_dict['features'].keys():
target_dict['features'][feature_type] = set()
target_dict['features'][feature_type].update([(i.start, i.end)])
print('building first last exon/cds')
first_last_exon_cds(all_dict)
print('building five/three prime UTR')
five_three_utr(all_dict)
print('building branchpoint')
if branchpoint is not None:
add_feature(all_dict, branchpoint, intron, name = 'branchpoint') # experimental
if branchpoint_pred_bed is not None:
add_feature(all_dict, branchpoint_pred_bed, intron, name = 'branchpoint_pred')
print('building polyA')
if polya is not None:
add_feature(all_dict, polya, transcript, name = 'polyAsite') # experimental
if polyasignal_bed is not None:
add_feature(all_dict, polyasignal_bed, transcript, name = 'polyAsignal') # experimental
print('writing to directory')
with open(os.path.join(annotation_path, 'gencode'), 'wb') as f:
pickle.dump(all_dict, f)
return all_dict
[9]:
d = build_transcript_dict()
***** WARNING: File /tmp/pybedtools.ut5if3si.tmp has inconsistent naming convention for record:
GL000009.2 ENSEMBL exon 56140 58376 . - . ID=exon:ENST00000618686.1:1;Parent=ENST00000618686.1;gene_id=ENSG00000278704.1;transcript_id=ENST00000618686.1;gene_type=protein_coding;gene_name=ENSG00000278704;transcript_type=protein_coding;transcript_name=ENST00000618686;exon_number=1;exon_id=ENSE00003753029.1;level=3;protein_id=ENSP00000484918.1;transcript_support_level=NA;tag=basic,Ensembl_canonical
***** WARNING: File /tmp/pybedtools.ut5if3si.tmp has inconsistent naming convention for record:
GL000009.2 ENSEMBL exon 56140 58376 . - . ID=exon:ENST00000618686.1:1;Parent=ENST00000618686.1;gene_id=ENSG00000278704.1;transcript_id=ENST00000618686.1;gene_type=protein_coding;gene_name=ENSG00000278704;transcript_type=protein_coding;transcript_name=ENST00000618686;exon_number=1;exon_id=ENSE00003753029.1;level=3;protein_id=ENSP00000484918.1;transcript_support_level=NA;tag=basic,Ensembl_canonical
extracting from gencode annotations
building intron
building first last exon/cds
building five/three prime UTR
building branchpoint
***** WARNING: File /tmp/pybedtools.rcia93bn.tmp has inconsistent naming convention for record:
GL000194.1 ENSEMBL transcript 55677 112791 . - . ID=ENST00000613230.1;Parent=ENSG00000277400.1;gene_id=ENSG00000277400.1;transcript_id=ENST00000613230.1;gene_type=protein_coding;gene_name=ENSG00000277400;transcript_type=protein_coding;transcript_name=ENST00000613230;level=3;protein_id=ENSP00000483280.1;transcript_support_level=1;tag=basic,Ensembl_canonical
***** WARNING: File /tmp/pybedtools.rcia93bn.tmp has inconsistent naming convention for record:
GL000194.1 ENSEMBL transcript 55677 112791 . - . ID=ENST00000613230.1;Parent=ENSG00000277400.1;gene_id=ENSG00000277400.1;transcript_id=ENST00000613230.1;gene_type=protein_coding;gene_name=ENSG00000277400;transcript_type=protein_coding;transcript_name=ENST00000613230;level=3;protein_id=ENSP00000483280.1;transcript_support_level=1;tag=basic,Ensembl_canonical
***** WARNING: File /tmp/pybedtools.rcia93bn.tmp has inconsistent naming convention for record:
GL000194.1 ENSEMBL transcript 55677 112791 . - . ID=ENST00000613230.1;Parent=ENSG00000277400.1;gene_id=ENSG00000277400.1;transcript_id=ENST00000613230.1;gene_type=protein_coding;gene_name=ENSG00000277400;transcript_type=protein_coding;transcript_name=ENST00000613230;level=3;protein_id=ENSP00000483280.1;transcript_support_level=1;tag=basic,Ensembl_canonical
***** WARNING: File /tmp/pybedtools.rcia93bn.tmp has inconsistent naming convention for record:
GL000194.1 ENSEMBL transcript 55677 112791 . - . ID=ENST00000613230.1;Parent=ENSG00000277400.1;gene_id=ENSG00000277400.1;transcript_id=ENST00000613230.1;gene_type=protein_coding;gene_name=ENSG00000277400;transcript_type=protein_coding;transcript_name=ENST00000613230;level=3;protein_id=ENSP00000483280.1;transcript_support_level=1;tag=basic,Ensembl_canonical
building polyA
writing to directory
[10]:
# take a look at what's inside
d[list(d.keys())[0]] #
[10]:
{'chrom': 'chr1',
'start': 11868,
'end': 14409,
'strand': '+',
'id': 'ENSG00000223972.5',
'type': 'transcribed_unprocessed_pseudogene',
'name': 'DDX11L1',
'features': {'exon': {(12009, 12057),
(12178, 12227),
(12612, 12697),
(12612, 12721),
(12974, 13052),
(13220, 13374),
(13452, 13670)},
'intron': {(12227, 12612), (12721, 12974), (13052, 13220)},
'first_exon': {(11868, 12227)},
'last_exon': {(13220, 14409)},
'branchpoint_pred': {12950, 13193, 13201}}}