In [1]:
# this cell is tagged as parameters for `papermill` parameterization
nipah_config = None
altair_config = None
surface = None
#input files
e2_distances_file = None
func_scores_E2_file = None
func_scores_E3_file = None
concat_df_file = None
merged_df_file = None
#output files
E2_E3_entry_corr_plot = None
E2_E3_entry_all_muts_plot = None
entry_region_boxplot_plot = None
combined_region_barplot_output = None
In [2]:
# Parameters
func_scores_E2_file = "results/filtered_data/entry/e2_entry_filtered.csv"
func_scores_E3_file = "results/filtered_data/entry/e3_entry_filtered.csv"
e2_distances_file = "results/distances/2vsm_distances.csv"
merged_df_file = "results/filtered_data/entry/e2_e3_entry_filter_merged.csv"
concat_df_file = "results/filtered_data/entry/e2_e3_entry_filter_concat.csv"
E2_E3_entry_corr_plot = "results/images/E2_E3_entry_corr_plot.html"
E2_E3_entry_all_muts_plot = "results/images/E2_E3_entry_all_muts_plot.html"
nipah_config = "nipah_config.yaml"
altair_config = "data/custom_analyses_data/theme.py"
entry_region_boxplot_plot = "results/images/entry_region_boxplot_plot.html"
surface = "data/custom_analyses_data/surface_exposure.csv"
combined_region_barplot_output = "results/images/combined_region_barplot_output.html"
In [3]:
import math
import os
import re
import altair as alt
import numpy as np
import pandas as pd
import scipy.stats
import yaml
import sys
# allow more rows for Altair
_ = alt.data_transformers.disable_max_rows()
# setup working directory
if os.getcwd() == "/fh/fast/bloom_j/computational_notebooks/blarsen/2023/Nipah_Malaysia_RBP_DMS/":
pass
print("Already in correct directory")
else:
os.chdir("/fh/fast/bloom_j/computational_notebooks/blarsen/2023/Nipah_Malaysia_RBP_DMS/")
print("Setup in correct directory")
#import altair themes from /data/custom_analyses_data/theme.py and enable
sys.path.append('data/custom_analyses_data/')
import theme
alt.themes.register('main_theme', theme.main_theme)
alt.themes.enable('main_theme')
Setup in correct directory
Out[3]:
ThemeRegistry.enable('main_theme')
Paths for running notebook interactively¶
In [4]:
if nipah_config is None:
e2_distances_file = "results/distances/2vsm_distances.csv"
func_scores_E2_file = "results/filtered_data/entry/e2_entry_filtered.csv"
func_scores_E3_file = "results/filtered_data/entry/e3_entry_filtered.csv"
concat_df_file = "results/filtered_data/entry/e2_e3_entry_filter_concat.csv"
merged_df_file = "results/filtered_data/entry/e2_e3_entry_filter_merged.csv"
nipah_config = "nipah_config.yaml"
#altair_config = "data/custom_analyses_data/theme.py"
surface = "data/custom_analyses_data/surface_exposure.csv"
Read in custom altair theme and import YAML file with parameters¶
In [5]:
with open(nipah_config) as f:
config = yaml.safe_load(f)
Import filtered data¶
In [6]:
#Import filtered entry scores from different selections
func_scores_E2 = pd.read_csv(func_scores_E2_file)
func_scores_E3 = pd.read_csv(func_scores_E3_file)
concat_df = pd.read_csv(concat_df_file) #concatanated entry scores
merged_df = pd.read_csv(merged_df_file) #merged entry scores
e2_distances = pd.read_csv(e2_distances_file)
Calculate some basic stats about E2 and E3 entry scores¶
In [7]:
def calculate_stats(df, name):
print(f"For {name}:")
total_mut = (532) * 19 #total number of possible amino acids
print(f'There are {total_mut} amino acid mutations possible')
muts_present = df["effect"].shape[0]
fraction_muts = muts_present / total_mut
print(
f"fraction muts present in {name} is {fraction_muts:.2f} {muts_present}/{total_mut}"
)
# Break mutations into bins (negative, neutral, positive) and calculate how many mutants there are
deleterious_muts = df[df["effect"] <= -0.5].shape[0]
neutral_muts = df[(df["effect"] > -0.5) & (df["effect"] < 0.5)].shape[0]
positive_muts = df[df["effect"] > 0.5].shape[0]
frac_bad_muts = deleterious_muts / muts_present
frac_neutral_muts = neutral_muts / muts_present
frac_pos_muts = positive_muts / muts_present
print(
f"The number of deleterious mutants for {name} is {frac_bad_muts:.2f} {deleterious_muts}/{muts_present}"
)
print(
f"The number of neutral mutants for {name} is {frac_neutral_muts:.2f} {neutral_muts}/{muts_present}"
)
print(
f"The number of positive mutants for {name} is {frac_pos_muts:.2f} {positive_muts}/{muts_present}\n"
)
calculate_stats(func_scores_E2, "CHO-bEFNB2")
calculate_stats(func_scores_E3, "CHO-bEFNB3")
For CHO-bEFNB2: There are 10108 amino acid mutations possible fraction muts present in CHO-bEFNB2 is 0.97 9786/10108 The number of deleterious mutants for CHO-bEFNB2 is 0.44 4296/9786 The number of neutral mutants for CHO-bEFNB2 is 0.54 5303/9786 The number of positive mutants for CHO-bEFNB2 is 0.02 186/9786 For CHO-bEFNB3: There are 10108 amino acid mutations possible fraction muts present in CHO-bEFNB3 is 0.96 9713/10108 The number of deleterious mutants for CHO-bEFNB3 is 0.43 4182/9713 The number of neutral mutants for CHO-bEFNB3 is 0.56 5451/9713 The number of positive mutants for CHO-bEFNB3 is 0.01 80/9713
Find which sites only have negative entry scores for all mutants¶
In [8]:
def overall_stats_all_neg(df,effect,name,region=None):
print(f'We are analyzing data for {name}:')
if region is None:
contact_df = df
else:
contact_df = df[df['site'].isin(region)]
filtered_df = contact_df.groupby('site').filter(lambda group: (group['effect'] < -2.25).all())
unique = filtered_df['site'].unique()
print(list(unique))
total_sites = contact_df['site'].unique().shape[0]
subset = filtered_df['site'].unique().shape[0]
fraction = subset/total_sites
percent = fraction * 100
print(f'The total number of sites are: {total_sites}\n')
print(f' The number of sites where all mutants are negative for {effect}: {subset}\n')
print(f' The percent of sites where all mutants are negative for {effect}: {percent:.0f}\n')
return unique
intolerant_sites_E2 = list(overall_stats_all_neg(func_scores_E2,'effect','CHO-bEFNB2'))
intolerant_sites_E3 = list(overall_stats_all_neg(func_scores_E3,'effect','CHO-bEFNB3',region=config['contact_sites']))
We are analyzing data for CHO-bEFNB2: [107, 121, 162, 263, 382, 387, 489, 499, 565] The total number of sites are: 532 The number of sites where all mutants are negative for effect: 9 The percent of sites where all mutants are negative for effect: 2 We are analyzing data for CHO-bEFNB3: [489, 506, 533, 557, 579, 581] The total number of sites are: 33 The number of sites where all mutants are negative for effect: 6 The percent of sites where all mutants are negative for effect: 18
Plot correlations between E2 and E3 entry¶
In [9]:
# merge distance data
distance_df = pd.merge(
merged_df, e2_distances[["site", "distance"]], on="site", how="left"
)
def determine_distance(df):
# Define the conditions
conditions = [
df["distance"] < 4,
(df["distance"] >= 4) & (df["distance"] <= 8),
df["distance"] > 8,
]
# Define the associated values for the conditions
choices = ["contact", "close", "distant"]
# Apply the conditions and choices to the 'E2_contact' column
df["contact"] = np.select(conditions, choices, default="distant")
return df
distance_df = determine_distance(distance_df)
In [10]:
def mean_correlation_plot(df, metric):
means = df.groupby('site')[["effect_E2", "effect_E3"]].mean().reset_index().round(2)
slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(
means["effect_E2"], means["effect_E3"]
)
contact_sites = df[["site", "contact", "wildtype"]].drop_duplicates()
df_mean = pd.merge(means, contact_sites, on="site", how="left")
display(df_mean)
chart = alt.Chart(df_mean).mark_point(filled=True,size=50,stroke='black',strokeWidth=1,opacity=1).encode(
alt.X("effect_E2", title="Mean entry in CHO-bEFNB2",axis=alt.Axis(tickCount=4)),
alt.Y("effect_E3", title="Mean entry in CHO-bEFNB3",axis=alt.Axis(tickCount=4)),
tooltip=["wildtype","site","effect_E2",'effect_E3','contact'],
color=alt.Color(
"contact",
scale=alt.Scale(
domain=["contact", "close", "distant"],
range=["#4e79a7", "#f28e2c", "gray"],
),
legend=alt.Legend(title="Receptor distance"),
),
).properties(width=300,height=300)
#Add r value to chart
min_ = int(df_mean["effect_E2"].min())
max_ = int(df_mean["effect_E3"].max())
text = alt.Chart({"values": [{"x": min_, "y": max_, "text": f"r = {r_value:.2f}"}]}).mark_text(
align="left",
baseline="top",
dx=-20, # Adjust this for position
dy=-20, # Adjust this for position
).encode(x=alt.X("x:Q"), y=alt.Y("y:Q"), text="text:N")
plot = chart + text
return plot
E2_E3_plot = mean_correlation_plot(distance_df, "mean")
E2_E3_plot.display()
if entry_region_boxplot_plot is not None:
E2_E3_plot.save(E2_E3_entry_corr_plot)
site | effect_E2 | effect_E3 | contact | wildtype | |
---|---|---|---|---|---|
0 | 71 | -1.18 | -0.62 | distant | Q |
1 | 72 | -1.23 | -0.76 | distant | N |
2 | 73 | -0.74 | -0.46 | distant | Y |
3 | 74 | -0.68 | -0.44 | distant | T |
4 | 75 | -0.73 | -0.29 | distant | R |
... | ... | ... | ... | ... | ... |
527 | 598 | -1.04 | -0.82 | distant | P |
528 | 599 | 0.01 | 0.12 | distant | E |
529 | 600 | 0.07 | 0.18 | distant | Q |
530 | 601 | -1.06 | -0.77 | distant | C |
531 | 602 | 0.20 | 0.28 | distant | T |
532 rows × 5 columns
In [11]:
def correlation_plot(df):
df = df.dropna()
slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(
df["effect_E2"], df["effect_E3"]
)
# setup interactive
variant_selector = alt.selection_point(
on="mouseover", empty=False, nearest=True, fields=["contact"], value=1
)
#plotting
chart = (
alt.Chart(df)
.mark_point(size=20,filled=True)
.encode(
x=alt.X("effect_E2", title="Entry in CHO-bEFNB2",axis=alt.Axis(tickCount=4)),
y=alt.Y("effect_E3", title="Entry in CHO-bEFNB3",axis=alt.Axis(tickCount=4)),
tooltip=["site", "wildtype", "mutant"],
color=alt.Color(
"contact",
scale=alt.Scale(
domain=["contact", "close", "distant"],
range=["#4e79a7", "#f28e2c", "gray"],
),
legend=alt.Legend(title="Receptor distance"),
),
opacity=alt.condition(variant_selector,alt.value(1),alt.value(0.2)),
).add_params(variant_selector)
).properties(height=300,width=300)
min_ = int(df['effect_E2'].min())
max_ = int(df['effect_E3'].max())
text = (
alt.Chart({"values": [{"x": min_, "y": max_, "text": f"r = {r_value:.2f}"}]})
.mark_text(
align="left",
baseline="top",
dx=-18, # Adjust this for position
dy=-40, # Adjust this for position
)
.encode(x=alt.X("x:Q"), y=alt.Y("y:Q"), text="text:N")
)
plot = chart + text
return plot
tmp_img_corr = correlation_plot(distance_df)
tmp_img_corr.display()
if entry_region_boxplot_plot is not None:
tmp_img_corr.save(E2_E3_entry_all_muts_plot)
Make boxplot showing median entry by RBP region¶
In [12]:
def make_boxplot_entry_region(df):
barrel_ranges = {
"Stalk": list(range(70, 148)),
"Neck": list(range(148, 166)),
"Linker": list(range(166, 178)),
"Head": list(range(178, 603)),
}
custom_order = ["Stalk", "Neck", "Linker", "Head", "Receptor Contact", "Total"]
empty_charts = []
for selection in ["CHO-bEFNB2", "CHO-bEFNB3"]:
tmp_df = df[df["cell_type"] == selection]
agg_means = []
for barrel, sites in barrel_ranges.items():
subset = tmp_df[tmp_df["site"].isin(sites)]
for _, row in subset.iterrows():
agg_means.append(
{"region": barrel, "effect": row["effect"], "site": row["site"]}
)
agg_means_df = pd.DataFrame(agg_means)
chart = (
alt.Chart(agg_means_df, title=f"{selection}")
.mark_boxplot(color='darkgray',extent='min-max')
.encode(
x=alt.X(
"region:O",
sort=custom_order,
title="RBP Region",
axis=alt.Axis(labelAngle=-90),
),
y=alt.Y(
"effect",
title="Cell entry",
axis=alt.Axis(tickCount=4),
),
tooltip=["region", "effect", "site"],
).properties(width=200,height=200)
)
empty_charts.append(chart)
combined_effect_chart = alt.hconcat(*empty_charts).resolve_scale(
y="shared", x="shared", color="independent"
)
return combined_effect_chart
entry_region_boxplot = make_boxplot_entry_region(concat_df)
entry_region_boxplot.display()
if entry_region_boxplot_plot is not None:
entry_region_boxplot.save(entry_region_boxplot_plot)
Check for potential neutral/beneficial glycosylation sites¶
In [13]:
def find_potential_glycan_sites(df):
filtered = df[df["wildtype"].isin(["T", "S"])]
matching_sites = []
for index, row in filtered.iterrows():
# Check for existence of site two numbers prior with 'N' mutant and positive effect
prior_rows = df[
(df["site"] == row["site"] - 2) & (df["mutant"] == "N") & (df["effect"] > 0)
]
if not prior_rows.empty:
matching_sites.append(row["site"])
unique_list1 = list(set(matching_sites))
unique_list1 = [x - 2 for x in unique_list1]
filtered = df[df["wildtype"].isin(["N"])]
matching_sites = []
for index, row in filtered.iterrows():
# Check for existence of site two numbers prior with 'N' mutant and positive effect
prior_rows = df[
(df["site"] == row["site"] + 2)
& (df["mutant"].isin(["T", "S"]))
& (df["effect"] > 0)
]
if not prior_rows.empty:
matching_sites.append(row["site"])
unique_list2 = list(set(matching_sites))
unique_list = unique_list1 + unique_list2
unique_list.sort()
return unique_list
#call function
PNLG = find_potential_glycan_sites(func_scores_E3)
#read in surface exposure data to find potential glycans on surface of protein
surface_df = pd.read_csv(surface)
surface_df.rename(columns={"exposure_A": "Surface Exposure"}, inplace=True)
PNLG_SURFACE = surface_df[surface_df["site"].isin(PNLG)]
PNLG_SURFACE = list(
PNLG_SURFACE[PNLG_SURFACE["Surface Exposure"] >= 25]["site"].unique()
)
#print(f"\nThe surface exposed PNLG sites are: {PNLG_SURFACE}\n")
glycans = config["glycans"] #these are the glycan sites that are already present in protein
#filter out known glycan sites already present
filtered_PNLG_SURFACE = [num for num in PNLG_SURFACE if num not in glycans]
print(f'These are {len(filtered_PNLG_SURFACE)} potential glycan sites one amino acid away: {filtered_PNLG_SURFACE}')
#print(len(filtered_PNLG_SURFACE))
These are 15 potential glycan sites one amino acid away: [180, 191, 192, 311, 326, 359, 383, 403, 423, 473, 478, 543, 554, 570, 600]
Make bar plot of average entry scores for CHO-bEFNB3¶
In [14]:
def entry_by_site(df):
tmp_df = df.groupby('site')['effect'].mean().reset_index()
# define ranges of different RBP regions
barrel_ranges = {
"Stalk": list(range(70, 148)),
"Neck": list(range(148, 166)),
"Linker": list(range(166, 178)),
"Head": list(range(178, 603)),
}
custom_order = ["Stalk", "Neck", "Linker", "Head"] #custom order for color legend
agg_means = [] #store aggregation
# For each barrel, filter the dataframe to the sites belonging to that barrel and then store the means
for barrel, sites in barrel_ranges.items():
subset = tmp_df[tmp_df["site"].isin(sites)]
for _, row in subset.iterrows():
agg_means.append(
{"region": barrel, "effect": row["effect"], "site": row["site"]}
)
agg_means_df = pd.DataFrame(agg_means).round(2)
agg_means_df['beta_sheet'] = agg_means_df['site'].isin(config['beta_sheet']) #add a column specifying which sites are in beta sheets
### The main chart plotting
chart = (
alt.Chart(agg_means_df)
.mark_bar(opacity=1,color='black',binSpacing=0)
.encode(
x=alt.X("site:N", title='Site',axis=alt.Axis(labelAngle=-90,values=[100, 150, 200, 250, 300, 350, 400, 450, 500, 550, 600],tickCount=11,grid=True)),
y=alt.Y("effect", title="Mean entry in CHO-bEFNB3"),
tooltip=["site", "effect","region"],
color=alt.Color('region',sort=custom_order,title='Region',scale=alt.Scale(range=['#5778a4', '#e49444', '#d1615d', '#85b6b2'])),
)
).properties(width=alt.Step(1.5),height=150)
### Draw rectanges showing where beta sheets are in protein above chart
rect = alt.Chart(agg_means_df.query('beta_sheet == True')).mark_rect(color='black').encode(
alt.X('site:N',axis=None),
alt.Y('beta_sheet',axis=None)
).properties(width=800,height=10)
combined_chart = alt.vconcat(chart,padding=0).resolve_scale(y='independent',x='shared')
return combined_chart
entry_by_site_plot = entry_by_site(func_scores_E3)
entry_by_site_plot.display()
Make bar charts of mean cell entry by site for different regions, sorted by average and colored by the unmutated amino acid at position¶
In [15]:
def entry_by_site_region(df,site_list,name_of_title):
#calculate mean by site
tmp_df = df.groupby(['site','cell_type'])['effect'].mean().reset_index().round(2)
#make a unique values df to merge
unique_wildtypes_df = df.drop_duplicates(subset=['site','wildtype','wt_type','wildtype_site'])
#merge mean and unique values
tmp_df = pd.merge(tmp_df, unique_wildtypes_df[['site', 'wt_type','wildtype','wildtype_site']], on='site', how='left')
#filter by site
tmp_df = tmp_df[tmp_df['site'].isin(site_list)]
#make chart
chart = (
alt.Chart(tmp_df,title=name_of_title)
.mark_bar(opacity=1,color='black')
.encode(
y=alt.Y("wildtype_site:N", sort='-x',title='Site'),
x=alt.X("effect", title="Mean entry of mutants"),
tooltip=["site", "effect", 'wildtype', 'wt_type'],
#color=alt.Color('wt_type:N', scale=alt.Scale(scheme='tableau10'), legend=alt.Legend(title="Unmutated amino-acid type"))
)
).properties(width=100, height=alt.Step(14))
return chart
Ranked average entry in contact sites¶
In [16]:
entry_by_site_receptor_plot = entry_by_site_region(func_scores_E3,config['contact_sites'],'Contact Sites')
entry_by_site_receptor_plot.display()
Now make ranked averages for each region¶
Ranked average entry in RBP stalk¶
In [17]:
entry_by_site_receptor_plot_stalk = entry_by_site_region(func_scores_E3,list(range(70, 148)),'Stalk')
entry_by_site_receptor_plot_stalk.display()
Ranked average entry in RBP neck¶
In [18]:
entry_by_site_receptor_plot_neck = entry_by_site_region(func_scores_E3,list(range(148, 166)),'Neck')
entry_by_site_receptor_plot_neck.display()
Ranked average entry in RBP linker¶
In [19]:
entry_by_site_receptor_plot_linker = entry_by_site_region(func_scores_E3,list(range(166, 178)),'Linker')
entry_by_site_receptor_plot_linker.display()
In [20]:
combined_region_barplot = alt.hconcat(entry_by_site_receptor_plot_neck,entry_by_site_receptor_plot,padding=0).resolve_scale(y="independent", x="shared", color="shared")
combined_region_barplot.display()
combined_region_barplot.save(combined_region_barplot_output)
In [ ]:
In [ ]: