ephrin_binding.ipynb¶
Analyze effect of RBP mutations on receptor binding from DMS selection data using soluble bat Ephrin-B2 or -B3
- Written by Brendan Larsen
Papermill parameterization¶
In [1]:
# this cell is tagged as parameters for `papermill` parameterization
# input configs
nipah_config = None
# input files
binding_E2_file = None
binding_E3_file = None
# output images
entry_binding_combined_corr_plot = None
E2_E3_correlation = None
E2_E3_correlation_site = None
In [2]:
# Parameters
nipah_config = "nipah_config.yaml"
binding_E2_file = "results/filtered_data/binding/e2_binding_filtered.csv"
binding_E3_file = "results/filtered_data/binding/e3_binding_filtered.csv"
entry_binding_combined_corr_plot = (
"results/images/entry_binding_combined_corr_plot.html"
)
E2_E3_correlation = "results/images/E2_E3_correlation.html"
E2_E3_correlation_site = "results/images/E2_E3_correlation_site.html"
Import libraries, setup working directory, and import altair themes¶
In [3]:
import os
import math
import re
import altair as alt
import numpy as np
import pandas as pd
import scipy.stats
import yaml
import sys
# allow more rows for Altair
_ = alt.data_transformers.disable_max_rows()
# setup working directory
if os.getcwd() == "/fh/fast/bloom_j/computational_notebooks/blarsen/2023/Nipah_Malaysia_RBP_DMS/":
pass
print("Already in correct directory")
else:
os.chdir("/fh/fast/bloom_j/computational_notebooks/blarsen/2023/Nipah_Malaysia_RBP_DMS/")
print("Setup in correct directory")
#import altair themes from /data/custom_analyses_data/theme.py and enable
sys.path.append('data/custom_analyses_data/')
import theme
alt.themes.register('main_theme', theme.main_theme)
alt.themes.enable('main_theme')
Setup in correct directory
Out[3]:
ThemeRegistry.enable('main_theme')
Set hard paths for running in interactive mode¶
In [4]:
if nipah_config is None:
print("loading hard paths")
nipah_config = "nipah_config.yaml"
# input files
binding_E2_file = "results/filtered_data/binding/e2_binding_filtered.csv"
binding_E3_file = "results/filtered_data/binding/e3_binding_filtered.csv"
Import config variables¶
In [5]:
with open(nipah_config) as f:
config = yaml.safe_load(f)
Import and merge data¶
Import the filtered binding and entry data for bEFNB2 and bEFNB3¶
In [6]:
# import binding data
df_E2_filter = pd.read_csv(binding_E2_file)
df_E3_filter = pd.read_csv(binding_E3_file)
Merge data¶
In [7]:
## Merge the filtered EFNB2 and EFNB3 DataFrames for combined analysis.
df_binding_effect_merge = pd.merge(
df_E2_filter,
df_E3_filter,
on=["site", "wildtype", "mutant"],
suffixes=["_E2", "_E3"],
how="outer",
)
display(df_binding_effect_merge.head(3))
## Add a 'selection' column to distinguish between EFNB2 and EFNB3 data.
df_E2_filter["selection"] = "bEFNB2"
df_E3_filter["selection"] = "bEFNB3"
## Concatenate the DataFrames for plotting or further analysis.
df_binding_effect_concat = pd.concat([df_E2_filter, df_E3_filter])
site | wildtype | mutant | binding_mean_E2 | binding_std_E2 | times_seen_binding_E2 | effect_E2 | effect_std_E2 | times_seen_cell_entry_E2 | frac_models_E2 | mutant_type_E2 | binding_mean_E3 | binding_std_E3 | times_seen_binding_E3 | effect_E3 | effect_std_E3 | times_seen_cell_entry_E3 | frac_models_E3 | mutant_type_E3 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 71 | Q | D | -0.7817 | 1.6460 | 3.25 | -1.164 | 0.8890 | 4.500 | 1.0 | Negative | -0.29370 | 0.3305 | 4.0 | -0.3884 | 0.6369 | 3.429 | 1.0 | Negative |
1 | 71 | Q | E | 0.1659 | 0.4309 | 3.50 | -1.255 | 0.3123 | 5.375 | 1.0 | Negative | -0.35270 | 0.1762 | 4.0 | -0.2482 | 0.9791 | 4.571 | 1.0 | Negative |
2 | 71 | Q | F | -0.3429 | 0.5299 | 3.00 | -1.058 | 0.6637 | 4.625 | 1.0 | Aromatic | -0.03982 | 0.1283 | 3.0 | -0.4973 | 0.3080 | 3.286 | 1.0 | Aromatic |
Calculate stats¶
In [8]:
# What are the top 5 highest and lowest binding mutants for EFNB2 and EFNB3?
def find_highest_lowest(df, name):
df = df.round(2)
print(f"We are analyzing {name}\n")
print(f"The total number of mutants was: {df.shape[0]}\n")
tmp_df = df.sort_values(by="binding_mean")
print("These are the lowest binding mutants detected:")
display(tmp_df.head(5))
tmp_df_high = df.sort_values(by="binding_mean", ascending=False)
print("\nThese are the highest binding mutants detected:\n")
display(tmp_df_high.head(5))
# What about mutants with positive entry scores?
tmp_df = df[df["effect"] > 0].sort_values(by="binding_mean")
print("These are the lowest binding mutants detected with positive entry scores:")
display(tmp_df.head(5))
tmp_df_high = df[df["effect"] > 0].sort_values(by="binding_mean", ascending=False)
print(
"\nThese are the highest binding mutants detected with positive entry scores:\n"
)
display(tmp_df_high.head(5))
mean_df = df.groupby('site')['binding_mean'].sum().reset_index()
print('These are the sites with the highest summed binding score:\n')
display(mean_df.sort_values(by='binding_mean',ascending=False).head(10))
find_highest_lowest(df_E2_filter, "bEFNB2")
find_highest_lowest(df_E3_filter, "bEFNB3")
We are analyzing bEFNB2 The total number of mutants was: 6672 These are the lowest binding mutants detected:
site | wildtype | mutant | binding_mean | binding_std | times_seen_binding | effect | effect_std | times_seen_cell_entry | frac_models | mutant_type | selection | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
5369 | 501 | E | K | -5.28 | 0.93 | 24.50 | -0.64 | 0.61 | 32.00 | 1.0 | Positive | bEFNB2 |
5744 | 532 | A | V | -5.26 | 1.14 | 16.25 | -0.32 | 0.31 | 17.88 | 1.0 | Hydrophobic | bEFNB2 |
1770 | 236 | R | H | -5.05 | 0.88 | 22.00 | -1.25 | 0.34 | 29.38 | 1.0 | Positive | bEFNB2 |
5240 | 490 | Q | K | -4.97 | 1.08 | 8.50 | -0.69 | 0.54 | 11.38 | 1.0 | Positive | bEFNB2 |
5238 | 490 | Q | H | -4.75 | 0.86 | 8.25 | -0.45 | 0.67 | 10.88 | 1.0 | Positive | bEFNB2 |
These are the highest binding mutants detected:
site | wildtype | mutant | binding_mean | binding_std | times_seen_binding | effect | effect_std | times_seen_cell_entry | frac_models | mutant_type | selection | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
6365 | 580 | I | S | 2.15 | 1.77 | 4.75 | -1.00 | 0.69 | 6.38 | 1.0 | Hydrophilic | bEFNB2 |
2969 | 331 | N | E | 2.14 | 1.23 | 3.00 | -0.89 | 0.84 | 3.50 | 1.0 | Negative | bEFNB2 |
6468 | 586 | N | T | 1.96 | 0.58 | 5.25 | -0.82 | 0.82 | 7.00 | 1.0 | Hydrophilic | bEFNB2 |
1457 | 211 | G | F | 1.96 | 0.55 | 4.75 | -0.67 | 0.43 | 5.12 | 1.0 | Aromatic | bEFNB2 |
6040 | 553 | S | W | 1.94 | 0.29 | 10.00 | -0.30 | 0.43 | 13.25 | 1.0 | Aromatic | bEFNB2 |
These are the lowest binding mutants detected with positive entry scores:
site | wildtype | mutant | binding_mean | binding_std | times_seen_binding | effect | effect_std | times_seen_cell_entry | frac_models | mutant_type | selection | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
6102 | 558 | A | V | -4.65 | 0.73 | 17.25 | 0.12 | 0.36 | 18.50 | 1.0 | Hydrophobic | bEFNB2 |
6089 | 557 | N | D | -4.63 | 0.40 | 9.50 | 0.38 | 0.48 | 11.12 | 1.0 | Negative | bEFNB2 |
5654 | 526 | L | H | -4.55 | 0.78 | 5.25 | 0.11 | 0.39 | 6.75 | 1.0 | Positive | bEFNB2 |
6383 | 581 | Y | W | -4.52 | 1.01 | 10.75 | 0.09 | 0.41 | 11.50 | 1.0 | Aromatic | bEFNB2 |
5252 | 491 | S | H | -4.50 | 0.94 | 8.50 | 0.11 | 0.25 | 10.25 | 1.0 | Positive | bEFNB2 |
These are the highest binding mutants detected with positive entry scores:
site | wildtype | mutant | binding_mean | binding_std | times_seen_binding | effect | effect_std | times_seen_cell_entry | frac_models | mutant_type | selection | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
2264 | 282 | C | S | 1.61 | 1.06 | 4.50 | 0.34 | 0.33 | 3.71 | 0.5 | Hydrophilic | bEFNB2 |
4727 | 450 | Q | I | 1.46 | 1.19 | 5.00 | 0.14 | 0.40 | 6.00 | 1.0 | Hydrophobic | bEFNB2 |
787 | 164 | N | S | 1.30 | 1.14 | 3.75 | 0.24 | 0.29 | 6.12 | 1.0 | Hydrophilic | bEFNB2 |
6173 | 564 | N | K | 1.26 | 0.49 | 7.25 | 0.15 | 0.44 | 8.00 | 1.0 | Positive | bEFNB2 |
3633 | 376 | K | F | 1.21 | 1.78 | 3.25 | 0.17 | 0.27 | 2.62 | 1.0 | Aromatic | bEFNB2 |
These are the sites with the highest summed binding score:
site | binding_mean | |
---|---|---|
248 | 331 | 15.73 |
471 | 564 | 15.10 |
95 | 172 | 13.87 |
491 | 586 | 13.38 |
223 | 306 | 12.34 |
101 | 178 | 9.28 |
149 | 227 | 8.72 |
237 | 320 | 8.57 |
199 | 282 | 8.46 |
90 | 167 | 8.39 |
We are analyzing bEFNB3 The total number of mutants was: 6503 These are the lowest binding mutants detected:
site | wildtype | mutant | binding_mean | binding_std | times_seen_binding | effect | effect_std | times_seen_cell_entry | frac_models | mutant_type | selection | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
6013 | 555 | D | K | -2.15 | 1.23 | 6.0 | 0.09 | 0.42 | 6.71 | 1.0 | Positive | bEFNB3 |
6016 | 555 | D | R | -1.51 | 0.42 | 4.0 | 0.08 | 0.53 | 4.14 | 1.0 | Positive | bEFNB3 |
5699 | 530 | Q | L | -1.45 | 0.48 | 3.5 | -1.02 | 0.86 | 5.86 | 1.0 | Hydrophobic | bEFNB3 |
6301 | 583 | T | R | -1.41 | 0.45 | 5.5 | -1.00 | 0.49 | 6.29 | 1.0 | Positive | bEFNB3 |
6298 | 583 | T | K | -1.25 | 0.11 | 5.0 | -0.73 | 0.64 | 7.00 | 1.0 | Positive | bEFNB3 |
These are the highest binding mutants detected:
site | wildtype | mutant | binding_mean | binding_std | times_seen_binding | effect | effect_std | times_seen_cell_entry | frac_models | mutant_type | selection | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
6347 | 589 | R | G | 2.01 | 0.66 | 7.0 | -0.97 | 0.58 | 8.29 | 1.0 | Special | bEFNB3 |
6267 | 580 | I | M | 1.34 | 0.53 | 5.5 | -0.82 | 0.53 | 6.71 | 1.0 | Hydrophobic | bEFNB3 |
2298 | 270 | V | Q | 1.33 | 1.03 | 5.0 | -0.88 | 0.65 | 5.43 | 1.0 | Hydrophilic | bEFNB3 |
2251 | 267 | M | Q | 1.29 | 1.37 | 4.5 | -0.80 | 0.82 | 5.67 | 1.0 | Hydrophilic | bEFNB3 |
5352 | 492 | Q | L | 1.26 | 0.07 | 5.5 | 0.48 | 0.18 | 5.14 | 1.0 | Hydrophobic | bEFNB3 |
These are the lowest binding mutants detected with positive entry scores:
site | wildtype | mutant | binding_mean | binding_std | times_seen_binding | effect | effect_std | times_seen_cell_entry | frac_models | mutant_type | selection | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
6013 | 555 | D | K | -2.15 | 1.23 | 6.0 | 0.09 | 0.42 | 6.71 | 1.0 | Positive | bEFNB3 |
6016 | 555 | D | R | -1.51 | 0.42 | 4.0 | 0.08 | 0.53 | 4.14 | 1.0 | Positive | bEFNB3 |
5702 | 530 | Q | W | -1.00 | 0.55 | 3.0 | 0.02 | 0.66 | 3.29 | 1.0 | Aromatic | bEFNB3 |
6007 | 555 | D | A | -0.83 | 0.20 | 3.5 | 0.38 | 0.23 | 3.71 | 1.0 | Hydrophobic | bEFNB3 |
4751 | 441 | P | V | -0.75 | 0.18 | 5.5 | 0.37 | 0.09 | 5.29 | 1.0 | Hydrophobic | bEFNB3 |
These are the highest binding mutants detected with positive entry scores:
site | wildtype | mutant | binding_mean | binding_std | times_seen_binding | effect | effect_std | times_seen_cell_entry | frac_models | mutant_type | selection | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
5352 | 492 | Q | L | 1.26 | 0.07 | 5.5 | 0.48 | 0.18 | 5.14 | 1.0 | Hydrophobic | bEFNB3 |
1665 | 211 | G | F | 1.17 | 0.29 | 5.0 | 0.16 | 0.57 | 5.71 | 1.0 | Aromatic | bEFNB3 |
6429 | 598 | P | G | 1.15 | 1.56 | 4.5 | 0.24 | 0.51 | 5.43 | 1.0 | Special | bEFNB3 |
5987 | 553 | S | W | 0.99 | 0.44 | 9.5 | 0.17 | 0.41 | 9.86 | 1.0 | Aromatic | bEFNB3 |
6290 | 582 | D | W | 0.98 | 0.34 | 6.5 | 0.06 | 0.31 | 7.14 | 1.0 | Aromatic | bEFNB3 |
These are the sites with the highest summed binding score:
site | binding_mean | |
---|---|---|
489 | 589 | 11.70 |
86 | 161 | 7.33 |
468 | 564 | 6.73 |
224 | 306 | 6.50 |
59 | 132 | 6.17 |
241 | 323 | 6.10 |
469 | 566 | 5.97 |
260 | 342 | 5.72 |
128 | 204 | 5.64 |
174 | 254 | 5.14 |
In [9]:
def overall_stats(df, effect, name):
# Now group sites and find sites where all mutants are deleterious
filtered_df = df.groupby("site").filter(lambda group: (group[effect] < -0.25).all())
# Which sites are these?
unique = filtered_df["site"].unique()
# Convert unique to a Pandas Series
unique_series = pd.Series(unique)
# Find the common elements that are also contact sites
unique_contact_bool = unique_series.isin(config["contact_sites"])
# Filter and get the common elements
common_elements = unique_series[unique_contact_bool]
print(f"The dataset we are analyzing is: {name}\n")
# Print the common elements
print(
f"Here are the contact sites that only have negative binding scores: {list(common_elements)}\n"
)
print(f"There are {len(unique)} sites with all negative binding score mutants\n")
print(
f"These are the sites with all negative binding score mutants: {list(unique)}\n"
)
# Now find sites with low and high binding (median)
median_df = (
df.groupby("site")["binding_mean"]
.max()
.reset_index()
.sort_values(by="binding_mean", ascending=False)
)
print("These are the sites with the highest binding mutants:\n")
display(median_df.head(5))
# Now calculate mutant number
total_mutants = df.shape[0]
mutants_above_cutoff_tolerated = df[df["effect"] > 0]
mutants_above_cutoff_tolerated = mutants_above_cutoff_tolerated[
["site", "effect", "binding_mean", "wildtype", "mutant"]
]
total_sites = df["site"].unique().shape[0]
print(f"The total number of sites are: {total_sites}")
overall_stats(df_E2_filter, "binding_mean", "EFNB2")
overall_stats(df_E3_filter, "binding_mean", "EFNB3")
The dataset we are analyzing is: EFNB2 Here are the contact sites that only have negative binding scores: [239, 242, 389, 488, 490, 491, 501, 504, 505, 531, 532, 533, 557, 579, 581, 588] There are 43 sites with all negative binding score mutants These are the sites with all negative binding score mutants: [100, 116, 220, 236, 238, 239, 242, 243, 248, 346, 351, 352, 389, 390, 399, 400, 435, 438, 441, 460, 467, 486, 487, 488, 490, 491, 494, 495, 497, 501, 504, 505, 526, 531, 532, 533, 557, 579, 581, 584, 585, 588, 590] These are the sites with the highest binding mutants:
site | binding_mean | |
---|---|---|
485 | 580 | 2.152 |
248 | 331 | 2.144 |
491 | 586 | 1.962 |
134 | 211 | 1.957 |
460 | 553 | 1.945 |
The total number of sites are: 508 The dataset we are analyzing is: EFNB3 Here are the contact sites that only have negative binding scores: [389, 488, 501, 505, 531, 532] There are 16 sites with all negative binding score mutants These are the sites with all negative binding score mutants: [108, 140, 352, 389, 467, 486, 488, 494, 495, 497, 501, 505, 510, 531, 532, 584] These are the sites with the highest binding mutants:
site | binding_mean | |
---|---|---|
489 | 589 | 2.006 |
481 | 580 | 1.337 |
188 | 270 | 1.329 |
185 | 267 | 1.291 |
404 | 492 | 1.261 |
The total number of sites are: 503
Find sites with opposite effects on binding¶
In [10]:
# find sites that are different
def find_biggest_differences(df):
df = df.round(2)
efnb2_good_efnb3_bad = df[
(df["binding_mean_E2"] > 0.5) & (df["binding_mean_E3"] < -0.5)
].sort_values(by="binding_mean_E2", ascending=False)
print('mutantions good for efnb2 binding, bad for efnb3 binding:\n')
display(efnb2_good_efnb3_bad)
efnb2_bad_efnb3_good = df[
(df["binding_mean_E2"] < -0.5) & (df["binding_mean_E3"] > 0.5)
].sort_values(by="binding_mean_E3", ascending=False)
print('mutantions bad for efnb2 binding, good for efnb3 binding:\n')
display(efnb2_bad_efnb3_good)
find_biggest_differences(df_binding_effect_merge)
mutantions good for efnb2 binding, bad for efnb3 binding:
site | wildtype | mutant | binding_mean_E2 | binding_std_E2 | times_seen_binding_E2 | effect_E2 | effect_std_E2 | times_seen_cell_entry_E2 | frac_models_E2 | mutant_type_E2 | binding_mean_E3 | binding_std_E3 | times_seen_binding_E3 | effect_E3 | effect_std_E3 | times_seen_cell_entry_E3 | frac_models_E3 | mutant_type_E3 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
5924 | 546 | L | H | 1.52 | 1.14 | 5.25 | -1.15 | 0.39 | 6.75 | 1.0 | Positive | -0.54 | 0.67 | 3.5 | -0.85 | 0.54 | 5.14 | 1.0 | Positive |
2543 | 303 | P | C | 1.34 | 1.58 | 5.00 | -0.72 | 0.59 | 5.00 | 1.0 | Special | -0.57 | 0.10 | 4.0 | -0.70 | 0.59 | 4.86 | 1.0 | Special |
112 | 79 | N | S | 1.29 | 1.65 | 3.00 | -0.64 | 0.50 | 2.38 | 1.0 | Hydrophilic | -0.65 | 0.26 | 3.0 | -0.13 | 0.57 | 3.43 | 1.0 | Hydrophilic |
93 | 78 | D | I | 0.66 | 0.63 | 5.25 | -0.93 | 0.40 | 6.50 | 1.0 | Hydrophobic | -0.57 | 0.76 | 5.0 | -0.52 | 0.84 | 5.14 | 1.0 | Hydrophobic |
1637 | 224 | M | Q | 0.65 | 0.88 | 3.50 | 0.17 | 0.31 | 2.75 | 1.0 | Hydrophilic | -0.58 | 0.38 | 3.5 | 0.21 | 0.29 | 4.29 | 1.0 | Hydrophilic |
5591 | 520 | I | G | 0.59 | 1.01 | 5.00 | -1.46 | 0.35 | 8.00 | 1.0 | Special | -0.53 | 0.15 | 5.5 | -1.31 | 0.32 | 7.00 | 1.0 | Special |
996 | 178 | V | T | 0.53 | 0.73 | 3.75 | -0.53 | 0.76 | 4.25 | 1.0 | Hydrophilic | -0.51 | 0.60 | 3.0 | -0.02 | 0.43 | 3.57 | 1.0 | Hydrophilic |
mutantions bad for efnb2 binding, good for efnb3 binding:
site | wildtype | mutant | binding_mean_E2 | binding_std_E2 | times_seen_binding_E2 | effect_E2 | effect_std_E2 | times_seen_cell_entry_E2 | frac_models_E2 | mutant_type_E2 | binding_mean_E3 | binding_std_E3 | times_seen_binding_E3 | effect_E3 | effect_std_E3 | times_seen_cell_entry_E3 | frac_models_E3 | mutant_type_E3 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
6498 | 588 | I | P | -2.35 | 0.84 | 5.25 | -0.26 | 0.30 | 4.75 | 1.0 | Special | 1.18 | 0.26 | 5.5 | -0.61 | 0.38 | 5.43 | 1.0 | Special |
6602 | 598 | P | G | -0.55 | 0.79 | 5.00 | -0.87 | 0.94 | 4.62 | 1.0 | Special | 1.15 | 1.56 | 4.5 | 0.24 | 0.51 | 5.43 | 1.0 | Special |
3093 | 338 | R | G | -0.52 | 0.51 | 4.50 | 0.30 | 0.42 | 5.25 | 1.0 | Special | 0.68 | 0.98 | 4.0 | 0.16 | 0.34 | 3.43 | 1.0 | Special |
5248 | 491 | S | A | -0.93 | 0.54 | 5.50 | -0.67 | 0.81 | 6.25 | 1.0 | Hydrophobic | 0.65 | 0.01 | 6.5 | 0.14 | 0.36 | 6.14 | 1.0 | Hydrophobic |
5274 | 492 | Q | R | -2.82 | 0.61 | 19.25 | -0.08 | 0.27 | 23.12 | 1.0 | Positive | 0.64 | 0.05 | 20.0 | 0.51 | 0.12 | 20.57 | 1.0 | Positive |
Find the top overlapping binders for both bEFNB2 and bEFNB3¶
In [11]:
def find_good_binding_for_both(df):
e2_good = df.sort_values(by='binding_mean_E2',ascending=False)
e3_good = df.sort_values(by='binding_mean_E3',ascending=False)
e2_good = e2_good.head(50)
e3_good = e3_good.head(50)
combo = pd.merge(e2_good,e3_good,on=['site','wildtype','mutant'],how='inner')
display(combo)
find_good_binding_for_both(df_binding_effect_merge)
site | wildtype | mutant | binding_mean_E2_x | binding_std_E2_x | times_seen_binding_E2_x | effect_E2_x | effect_std_E2_x | times_seen_cell_entry_E2_x | frac_models_E2_x | ... | frac_models_E2_y | mutant_type_E2_y | binding_mean_E3_y | binding_std_E3_y | times_seen_binding_E3_y | effect_E3_y | effect_std_E3_y | times_seen_cell_entry_E3_y | frac_models_E3_y | mutant_type_E3_y | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 580 | I | S | 2.152 | 1.7680 | 4.750 | -1.0010 | 0.6921 | 6.375 | 1.00 | ... | 1.00 | Hydrophilic | 0.8187 | 0.030890 | 6.0 | 0.1163 | 0.3235 | 5.571 | 1.0 | Hydrophilic |
1 | 211 | G | F | 1.957 | 0.5488 | 4.750 | -0.6654 | 0.4262 | 5.125 | 1.00 | ... | 1.00 | Aromatic | 1.1690 | 0.289000 | 5.0 | 0.1637 | 0.5716 | 5.714 | 1.0 | Aromatic |
2 | 553 | S | W | 1.945 | 0.2867 | 10.000 | -0.2967 | 0.4293 | 13.250 | 1.00 | ... | 1.00 | Aromatic | 0.9943 | 0.435200 | 9.5 | 0.1669 | 0.4060 | 9.857 | 1.0 | Aromatic |
3 | 139 | N | W | 1.930 | 0.4834 | 5.750 | -0.6672 | 0.6217 | 8.000 | 1.00 | ... | 1.00 | Aromatic | 1.0030 | 0.004575 | 4.5 | -0.4441 | 0.4452 | 6.286 | 1.0 | Aromatic |
4 | 589 | R | G | 1.668 | 0.9018 | 7.333 | -1.3800 | 0.4560 | 11.880 | 0.75 | ... | 0.75 | Special | 2.0060 | 0.659100 | 7.0 | -0.9655 | 0.5763 | 8.286 | 1.0 | Special |
5 | 306 | N | R | 1.523 | 1.8470 | 3.750 | -1.2790 | 0.4835 | 5.625 | 1.00 | ... | 1.00 | Positive | 0.9003 | 0.713800 | 4.5 | -0.8509 | 0.5830 | 6.286 | 1.0 | Positive |
6 rows × 35 columns
Find sites with the largest absolute difference in binding¶
In [12]:
# Compare E2 and E3 binders
def find_highest_lowest(df):
df = df.round(2)
df["binding_diff"] = (df["binding_mean_E2"] - df["binding_mean_E3"]).abs()
print(
"These are the mutants with the biggest difference between EFNB2 and EFNB3:\n"
)
display(df.sort_values(by="binding_diff", ascending=False).head(10))
# calculate aggregate differences
agg_df = (
df.groupby("site")[["binding_mean_E2", "binding_mean_E3", "binding_diff"]]
.mean()
.reset_index()
)
agg_df = agg_df.round(2)
print("These are the sites with the biggest difference between EFNB2 and EFNB3:\n")
display(agg_df.sort_values(by="binding_diff", ascending=False).head(5))
find_highest_lowest(df_binding_effect_merge)
These are the mutants with the biggest difference between EFNB2 and EFNB3:
site | wildtype | mutant | binding_mean_E2 | binding_std_E2 | times_seen_binding_E2 | effect_E2 | effect_std_E2 | times_seen_cell_entry_E2 | frac_models_E2 | mutant_type_E2 | binding_mean_E3 | binding_std_E3 | times_seen_binding_E3 | effect_E3 | effect_std_E3 | times_seen_cell_entry_E3 | frac_models_E3 | mutant_type_E3 | binding_diff | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
5718 | 530 | Q | F | -4.01 | 0.47 | 6.25 | 0.47 | 0.20 | 5.38 | 1.0 | Aromatic | 0.08 | 0.06 | 6.0 | 0.36 | 0.27 | 6.29 | 1.0 | Aromatic | 4.09 |
5247 | 490 | Q | Y | -4.29 | 0.90 | 5.50 | -1.09 | 0.40 | 6.38 | 1.0 | Aromatic | -0.46 | 0.57 | 5.5 | -0.31 | 0.80 | 5.71 | 1.0 | Aromatic | 3.83 |
5252 | 491 | S | H | -4.50 | 0.94 | 8.50 | 0.11 | 0.25 | 10.25 | 1.0 | Positive | -0.87 | 0.79 | 7.0 | -1.38 | 0.49 | 8.29 | 1.0 | Positive | 3.63 |
5251 | 491 | S | G | -4.18 | 0.59 | 6.50 | 0.13 | 0.30 | 8.62 | 1.0 | Special | -0.56 | 0.05 | 5.5 | -0.24 | 0.61 | 6.29 | 1.0 | Special | 3.62 |
5246 | 490 | Q | W | -4.20 | 1.00 | 3.75 | 0.23 | 0.38 | 4.75 | 1.0 | Aromatic | -0.59 | 0.32 | 3.0 | -0.15 | 0.46 | 3.29 | 1.0 | Aromatic | 3.61 |
6498 | 588 | I | P | -2.35 | 0.84 | 5.25 | -0.26 | 0.30 | 4.75 | 1.0 | Special | 1.18 | 0.26 | 5.5 | -0.61 | 0.38 | 5.43 | 1.0 | Special | 3.53 |
5270 | 492 | Q | K | -3.52 | 0.55 | 11.00 | 0.35 | 0.23 | 11.62 | 1.0 | Positive | 0.01 | 0.08 | 10.0 | 0.33 | 0.24 | 11.57 | 1.0 | Positive | 3.53 |
5274 | 492 | Q | R | -2.82 | 0.61 | 19.25 | -0.08 | 0.27 | 23.12 | 1.0 | Positive | 0.64 | 0.05 | 20.0 | 0.51 | 0.12 | 20.57 | 1.0 | Positive | 3.46 |
1814 | 239 | S | H | -3.44 | 0.09 | 5.25 | 0.33 | 0.27 | 4.50 | 1.0 | Positive | -0.04 | 0.83 | 4.0 | -1.22 | 0.69 | 5.00 | 1.0 | Positive | 3.40 |
5727 | 530 | Q | V | -4.33 | 0.84 | 6.00 | 0.11 | 0.27 | 8.62 | 1.0 | Hydrophobic | -0.99 | 0.90 | 5.5 | -1.44 | 0.74 | 7.00 | 1.0 | Hydrophobic | 3.34 |
These are the sites with the biggest difference between EFNB2 and EFNB3:
site | binding_mean_E2 | binding_mean_E3 | binding_diff | |
---|---|---|---|---|
415 | 505 | -3.79 | -0.77 | 2.90 |
439 | 530 | -3.63 | -0.56 | 2.80 |
405 | 491 | -3.58 | -0.36 | 2.78 |
495 | 588 | -3.50 | 0.04 | 2.69 |
404 | 490 | -3.81 | -0.37 | 2.50 |
Make plots¶
Make plots showing correlation between binding and entry for EFNB2 and EFNB3¶
In [13]:
def plot_corr_binding_entry_updated(df, flag):
df = df.copy().round(2)
# Define interactive selectors for variant selection.
variant_selector = alt.selection_point(
on="mouseover", nearest=True, empty=False, fields=["site", "mutant"], value=0
)
# 'variant_selector_agg' is for aggregated selection based on 'site' only.
variant_selector_agg = alt.selection_point(
on="mouseover", nearest=True, empty=False, fields=["site"], value=0
)
# Initialize an empty list to store charts for each unique selection in 'df'.
empty_chart = []
# Loop through each unique cell selection in the DataFrame.
for cell in list(df["selection"].unique()):
# Filter DataFrame for the current selection.
tmp_df = df[df["selection"] == cell]
# Check if aggregation flag is True to aggregate data.
if flag == True:
# Aggregate data by 'site', summing 'binding_median' and 'effect', then reset index.
agg_df = (
tmp_df.groupby("site")[["binding_mean", "effect"]].mean().reset_index().round(2)
)
# Create a chart using aggregated data with specified encodings.
chart = (
alt.Chart(agg_df)
.mark_point(stroke="black", filled=True) # Use filled points with black stroke.
.encode(
x=alt.X(
"effect",
title=f"Mean entry in CHO-{cell}",
axis=alt.Axis(grid=False),
),
y=alt.Y(
"binding_mean",
title=f"Mean {cell} binding",
axis=alt.Axis(grid=False),
),
# Dynamic opacity, size, strokeWidth, and color based on 'variant_selector_agg'.
opacity=alt.condition(
variant_selector_agg, alt.value(1), alt.value(0.2)
),
size=alt.condition(
variant_selector_agg, alt.value(100), alt.value(50)
),
strokeWidth=alt.condition(
variant_selector_agg, alt.value(1), alt.value(0)
),
color=alt.condition(
variant_selector_agg, alt.value("orange"), alt.value("gray")
),
tooltip=["site", "binding_mean", "effect"],
)
.add_params(variant_selector_agg)
)
# Add the chart to the list.
empty_chart.append(chart)
else:
# Create a chart for individual data points with specified encodings.
chart = (
alt.Chart(tmp_df)
.mark_point(stroke="black", filled=True)
.encode(
x=alt.X(
"effect", title=f"Entry in CHO-{cell}", axis=alt.Axis(grid=False)
),
y=alt.Y(
"binding_mean",
title=f"{cell} binding",
axis=alt.Axis(grid=False),
),
# Dynamic opacity, size, strokeWidth, and color based on 'variant_selector'.
opacity=alt.condition(
variant_selector, alt.value(1), alt.value(0.1)
),
size=alt.condition(variant_selector, alt.value(50), alt.value(20)),
strokeWidth=alt.condition(
variant_selector, alt.value(1), alt.value(0)
),
color=alt.condition(
variant_selector, alt.value("orange"), alt.value("gray")
),
tooltip=[
"site",
"wildtype",
"mutant",
"binding_mean",
"times_seen_binding",
"effect",
],
)
.add_params(variant_selector)
)
# Add the chart to the list.
empty_chart.append(chart)
# Combine all charts horizontally with a title.
combined_chart = alt.hconcat(
*empty_chart, title=alt.Title("Correlation between binding and entry")
)
# Return the combined chart for display or further use.
return combined_chart
# Generate and display plots for non-aggregated data.
entry_binding_corr_plot = plot_corr_binding_entry_updated(df_binding_effect_concat, False)
entry_binding_corr_plot.display()
# Save the plot
if entry_binding_combined_corr_plot is not None:
entry_binding_corr_plot.save(entry_binding_combined_corr_plot)
# Repeat for aggregated data.
entry_binding_corr_plot_agg = plot_corr_binding_entry_updated(df_binding_effect_concat, True)
entry_binding_corr_plot_agg.display()
Same plot as above, but slightly different format¶
In [14]:
def plot_entry_binding_corr_heatmap(df):
empty_chart = []
for cell in list(df["selection"].unique()):
tmp_df = df[df["selection"] == cell]
chart = (
alt.Chart(tmp_df, title=f"{cell}")
.mark_rect()
.encode(
x=alt.X(
"effect", title="Cell entry", axis=alt.Axis(values=[-2, -1, 0, 1])
).bin(maxbins=50),
y=alt.Y(
"binding_mean",
title="Receptor binding",
axis=alt.Axis(values=[-4, -2, 0, 2]),
).bin(maxbins=50),
color=alt.Color("count()", title="Count").scale(type='log'),
)
).properties(width=200,height=200)
empty_chart.append(chart)
combined_chart = alt.hconcat(
*empty_chart,
).resolve_scale(y="shared", x="shared", color="shared")
return combined_chart
entry_binding_corr_heat = plot_entry_binding_corr_heatmap(df_binding_effect_concat)
entry_binding_corr_heat.display()
Find correlations between bEFNB2 and bEFNB3 binding¶
In [15]:
def plot_entry_binding_corr(df):
chart = (
alt.Chart(df, title="Correlation Between Mutant Binding Scores")
.mark_rect()
.encode(
x=alt.X(
"binding_mean_E2",
title="bEFNB2 binding",
axis=alt.Axis(values=[-4,-2, 0, 2]),
).bin(maxbins=50),
y=alt.Y(
"binding_mean_E3",
title="bEFNB3 binding",
axis=alt.Axis(values=[-2, 0, 2]),
).bin(maxbins=50),
color=alt.Color("count()", title="Count").scale(type='log'),
)
).properties(width=200,height=200)
return chart
entry_binding_corr_heatmap_1 = plot_entry_binding_corr(df_binding_effect_merge)
entry_binding_corr_heatmap_1.display()
Plot correlations of binding mutants in scatterplots, and color different subsets¶
First, find mutations that are outliers in the correlation between bEFNB2 and bEFNB3 binding¶
In [16]:
def find_outliers(df):
df = df.dropna().copy()
#Calculate the best fit line
slope, intercept = np.polyfit(df['binding_mean_E2'], df['binding_mean_E3'], 1)
#compute residuals
df['predicted_y'] = slope * df['binding_mean_E2'] + intercept
df['residuals'] = df['binding_mean_E3'] - df['predicted_y']
#identify outliers
# calculate the mean and standard deviation of the residuals
mean_residual = np.mean(df['residuals'])
std_residual = np.std(df['residuals'])
outlier_threshold = 4.5 #4.5 std deviations
df['is_outlier'] = abs(df['residuals']) > (std_residual * outlier_threshold)
# Filter the DataFrame to only include outliers
outliers = df[df['is_outlier']]
print(f'Here are the outlier mutations outside of a {outlier_threshold} standard deviation:\n')
display(outliers)
outliers_list = list(outliers['site'].unique())
print(f' Here are the sites: \n {outliers_list}')
return df,outliers_list
residuals_df,outliers_list = find_outliers(df_binding_effect_merge)
Here are the outlier mutations outside of a 4.5 standard deviation:
site | wildtype | mutant | binding_mean_E2 | binding_std_E2 | times_seen_binding_E2 | effect_E2 | effect_std_E2 | times_seen_cell_entry_E2 | frac_models_E2 | ... | binding_std_E3 | times_seen_binding_E3 | effect_E3 | effect_std_E3 | times_seen_cell_entry_E3 | frac_models_E3 | mutant_type_E3 | predicted_y | residuals | is_outlier | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
112 | 79 | N | S | 1.2920 | 1.6520 | 3.000 | -0.63520 | 0.4964 | 2.375 | 1.00 | ... | 0.25950 | 3.0 | -0.12940 | 0.5688 | 3.429 | 1.0 | Hydrophilic | 0.272492 | -0.918892 | True |
566 | 137 | S | D | -0.4362 | 0.8381 | 4.000 | -1.10900 | 0.3349 | 7.000 | 1.00 | ... | 0.83060 | 4.0 | -0.41160 | 0.7602 | 3.571 | 1.0 | Negative | -0.118506 | 0.922206 | True |
620 | 143 | N | Q | 0.4306 | 0.6943 | 5.250 | -0.31900 | 0.3995 | 8.250 | 1.00 | ... | 1.21300 | 4.5 | -0.35540 | 0.7932 | 4.286 | 1.0 | Hydrophilic | 0.077604 | -1.079604 | True |
762 | 161 | S | E | 0.5686 | 0.6514 | 2.500 | -0.96530 | 0.5165 | 5.875 | 1.00 | ... | 0.25090 | 3.0 | -0.14880 | 0.4404 | 3.143 | 1.0 | Negative | 0.108826 | 1.000174 | True |
5271 | 492 | Q | L | -0.2126 | 0.2587 | 5.500 | 0.43260 | 0.2934 | 4.500 | 1.00 | ... | 0.06666 | 5.5 | 0.48330 | 0.1827 | 5.143 | 1.0 | Hydrophobic | -0.067918 | 1.328918 | True |
5274 | 492 | Q | R | -2.8230 | 0.6111 | 19.250 | -0.07662 | 0.2715 | 23.120 | 1.00 | ... | 0.05388 | 20.0 | 0.50950 | 0.1192 | 20.570 | 1.0 | Positive | -0.658510 | 1.302210 | True |
5718 | 530 | Q | F | -4.0150 | 0.4665 | 6.250 | 0.46740 | 0.2023 | 5.375 | 1.00 | ... | 0.06198 | 6.0 | 0.35920 | 0.2738 | 6.286 | 1.0 | Aromatic | -0.928195 | 1.012565 | True |
6065 | 555 | D | K | -3.3210 | 0.6425 | 8.000 | 0.20520 | 0.3225 | 10.250 | 1.00 | ... | 1.22500 | 6.0 | 0.08504 | 0.4221 | 6.714 | 1.0 | Positive | -0.771181 | -1.375819 | True |
6068 | 555 | D | R | -2.4650 | 1.6330 | 3.000 | 0.35590 | 0.1457 | 4.625 | 1.00 | ... | 0.42050 | 4.0 | 0.07591 | 0.5260 | 4.143 | 1.0 | Positive | -0.577514 | -0.932486 | True |
6498 | 588 | I | P | -2.3490 | 0.8405 | 5.250 | -0.25550 | 0.3004 | 4.750 | 1.00 | ... | 0.25530 | 5.5 | -0.61310 | 0.3819 | 5.429 | 1.0 | Special | -0.551269 | 1.734269 | True |
6506 | 589 | R | G | 1.6680 | 0.9018 | 7.333 | -1.38000 | 0.4560 | 11.880 | 0.75 | ... | 0.65910 | 7.0 | -0.96550 | 0.5763 | 8.286 | 1.0 | Special | 0.357561 | 1.648439 | True |
6602 | 598 | P | G | -0.5523 | 0.7852 | 5.000 | -0.86740 | 0.9393 | 4.625 | 1.00 | ... | 1.55900 | 4.5 | 0.23870 | 0.5146 | 5.429 | 1.0 | Special | -0.144773 | 1.298773 | True |
12 rows × 22 columns
Here are the sites: [79, 137, 143, 161, 492, 530, 555, 588, 589, 598]
Find mutations in the top quantile of binding for bEFNB2 and bEFNB3 both¶
In [17]:
def find_top_for_both(df):
quantile_threshold = 0.99
# Calculate the quantiles for both variables
x_quantile = df['binding_mean_E2'].quantile(quantile_threshold)
y_quantile = df['binding_mean_E3'].quantile(quantile_threshold)
# Filter points that are above the quantile threshold
df['meets_threshold'] = (df['binding_mean_E2'] >= x_quantile) & (df['binding_mean_E3'] >= y_quantile)
subset = df.query('meets_threshold == True')
top_mutants = list(subset['site'].unique())
return df, top_mutants
top_residuals_df,top_mutants_list = find_top_for_both(residuals_df)
print(f' The sites with the top binding mutations are : \n {top_mutants_list}')
cleaned_df = top_residuals_df.query('meets_threshold == True')[['wildtype','site','mutant','binding_mean_E2','binding_mean_E3']].round(2)
print(f'Here are the specific mutations:\n')
display(cleaned_df)
The sites with the top binding mutations are : [104, 139, 211, 268, 306, 508, 553, 580, 589] Here are the specific mutations:
wildtype | site | mutant | binding_mean_E2 | binding_mean_E3 | |
---|---|---|---|---|---|
390 | E | 104 | T | 1.93 | 0.65 |
593 | N | 139 | W | 1.93 | 1.00 |
1457 | G | 211 | F | 1.96 | 1.17 |
2044 | T | 268 | W | 1.52 | 0.77 |
2592 | N | 306 | M | 1.49 | 0.66 |
2595 | N | 306 | R | 1.52 | 0.90 |
5437 | Y | 508 | Q | 1.93 | 0.64 |
6040 | S | 553 | W | 1.94 | 0.99 |
6365 | I | 580 | S | 2.15 | 0.82 |
6506 | R | 589 | G | 1.67 | 2.01 |
6515 | R | 589 | V | 1.67 | 0.73 |
Plot correlations between individual mutational effects on binding, and color different subsets of sites¶
In [18]:
def plot_affinity_BLI_mutants(df, highlight_conditions, color, subset=None):
df = df.round(2).copy()
df['site_mutant'] = df['site'].astype(str) + df['mutant'].astype(str)
if subset is not None:
df = df[df['site'].isin(subset)]
slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(
df["binding_mean_E2"], df["binding_mean_E3"]
)
# make correlation chart
chart = alt.Chart(df #,title=alt.Title("Correlation Between Mutant Binding Scores",subtitle=f'r = {r_value:.2f}'
).mark_point(
color="gray",
size=30,
opacity=0.4,
filled=True
).encode(
x=alt.X("binding_mean_E2", title=("bEFNB2 Binding"),axis=alt.Axis(tickCount=4)),
y=alt.Y("binding_mean_E3", title=("bEFNB3 Binding"),axis=alt.Axis(tickCount=4)),
tooltip=[
"site",
"wildtype",
"mutant",
"binding_mean_E2",
"binding_mean_E3",
"effect_E2",
"effect_E3",
"binding_std_E2",
"binding_std_E3",
"times_seen_binding_E2",
"times_seen_binding_E3"
],
)
#make colored circles for specific data
highlight = chart.transform_filter(highlight_conditions).mark_point(
color=color, size=60, opacity=1, filled=True, stroke='black', strokeWidth=1
)
#write text near the orange circles
text_on_point = chart.transform_filter(highlight_conditions).mark_text(
align='center',baseline='top',dy=-20,fontSize=16
).encode(text='site_mutant')
min = int(df["binding_mean_E2"].min())
max = int(df["binding_mean_E3"].max())
text = (
alt.Chart({"values": [{"x": min, "y": max, "text": f"r = {r_value:.2f}"}]})
.mark_text(align="left", baseline="top", dx=-10, dy=-20)
.encode(x=alt.X("x:Q"), y=alt.Y("y:Q"), text="text:N")
)
# Vertical line at x=0
vline = alt.Chart(pd.DataFrame({'x': [0]})).mark_rule(color='gray',opacity=1,strokeDash=[2,4]).encode(x='x:Q')
# Horizontal line at y=0
hline = alt.Chart(pd.DataFrame({'y': [0]})).mark_rule(color='gray',opacity=1,strokeDash=[2,4]).encode(y='y:Q')
chart_and_text = chart + text_on_point + highlight + vline + hline
return chart_and_text.properties(height=300,width=300)
Sites selected for BLI validation¶
In [19]:
# these are the data we want to show in orange circles
highlight_conditions = (
(alt.datum.site == 244) & (alt.datum.mutant == 'W') |
(alt.datum.site == 305) & (alt.datum.mutant == 'W') |
(alt.datum.site == 492) & (alt.datum.mutant == 'L') |
(alt.datum.site == 507) & (alt.datum.mutant == 'I') |
(alt.datum.site == 530) & (alt.datum.mutant == 'F') |
(alt.datum.site == 553) & (alt.datum.mutant == 'W') |
(alt.datum.site == 555) & (alt.datum.mutant == 'Y') |
(alt.datum.site == 559) & (alt.datum.mutant == 'R') |
(alt.datum.site == 588) & (alt.datum.mutant == 'V')
)
E2_E3_corr_BLI_mutants = plot_affinity_BLI_mutants(df_binding_effect_merge,highlight_conditions,'#e49444')
E2_E3_corr_BLI_mutants.display()
Sites selected for neutralization validations¶
In [20]:
highlight_conditions_neut = (
(alt.datum.site == 333) & (alt.datum.mutant == 'Q') |
(alt.datum.site == 492) & (alt.datum.mutant == 'R') |
(alt.datum.site == 507) & (alt.datum.mutant == 'I') |
(alt.datum.site == 530) & (alt.datum.mutant == 'F') |
(alt.datum.site == 553) & (alt.datum.mutant == 'W') |
(alt.datum.site == 555) & (alt.datum.mutant == 'K')
)
neut_muts = plot_affinity_BLI_mutants(df_binding_effect_merge,highlight_conditions_neut,'#5778a4')
neut_muts.display()
Find mutations of interest¶
In [21]:
highlight_conditions_top_sites = (
(alt.datum.site == 580) & (alt.datum.mutant == 'S') |
(alt.datum.site == 211) & (alt.datum.mutant == 'F') |
(alt.datum.site == 553) & (alt.datum.mutant == 'W') |
(alt.datum.site == 589) & (alt.datum.mutant == 'G') |
(alt.datum.site == 306) & (alt.datum.mutant == 'R') |
(alt.datum.site == 492) & (alt.datum.mutant == 'L') |
(alt.datum.site == 492) & (alt.datum.mutant == 'R') |
(alt.datum.site == 588) & (alt.datum.mutant == 'P') |
(alt.datum.site == 530) & (alt.datum.mutant == 'F') |
(alt.datum.site == 546) & (alt.datum.mutant == 'H') |
(alt.datum.site == 555) & (alt.datum.mutant == 'K')
)
E2_E3_corr = plot_affinity_BLI_mutants(residuals_df,highlight_conditions_top_sites,'#af7aa1')
E2_E3_corr.display()
if entry_binding_combined_corr_plot is not None:
E2_E3_corr.save(E2_E3_correlation)
Plot correlations of mutations for individual sites of interest with letters of each mutant plotted¶
In [22]:
def plot_affinity_individual_mutants(df,mutant):
df = df.round(2).copy()
df = df[df['site'] == mutant]
if df.empty:
print('nothing here')
pass
else:
wildtype_site = (df['wildtype'].astype(str) + df['site'].astype(str)).unique()[0]
chart = alt.Chart(df,title=alt.Title(f"{wildtype_site}")
).mark_text(
size=15,
opacity=1,
).encode(
x=alt.X("binding_mean_E2", title=("bEFNB2 Binding"),axis=alt.Axis(tickCount=3),scale=alt.Scale(domain=[-5,2])),
y=alt.Y("binding_mean_E3", title=("bEFNB3 Binding"),axis=alt.Axis(tickCount=3),scale=alt.Scale(domain=[-2.5,2])),
text=alt.Text('mutant'),
color=alt.Color(
'mutant_type_E2',
#legend = None,
title='Amino acid type',
scale=alt.Scale(
domain=['Aromatic', 'Hydrophilic', 'Hydrophobic','Negative', 'Positive', 'Special'],
range=["#4e79a7","#f28e2c","#e15759","#76b7b2","#59a14f","#edc949"])),
tooltip=[
"site",
"wildtype",
"mutant",
"binding_mean_E2",
"binding_mean_E3",
"effect_E2",
"effect_E3",
],
)
# Vertical line at x=0
vline = alt.Chart(pd.DataFrame({'x': [0]})).mark_rule(color='gray',opacity=1,strokeDash=[2,4]).encode(x='x:Q')
# Horizontal line at y=0
hline = alt.Chart(pd.DataFrame({'y': [0]})).mark_rule(color='gray',opacity=1,strokeDash=[2,4]).encode(y='y:Q')
final_chart = vline + hline + chart
return final_chart.properties(height=200,width=200)
Make for individual mutations¶
In [23]:
def make_hconcat_amino_acid(site_list):
empty_chart = []
for site in site_list:
tmp_df = df_binding_effect_merge[df_binding_effect_merge['site'] == site]
non_nan_in_effect_E2 = tmp_df['binding_mean_E2'].notnull().any()
non_nan_in_effect_E3 = tmp_df['binding_mean_E3'].notnull().any()
if non_nan_in_effect_E2 and non_nan_in_effect_E3:
test_plots = plot_affinity_individual_mutants(tmp_df,site)
empty_chart.append(test_plots)
else:
pass
empty_chart_plot = alt.hconcat(*empty_chart)
return empty_chart_plot
one = make_hconcat_amino_acid([211,306,492])
two = make_hconcat_amino_acid([530,546,553])
three = make_hconcat_amino_acid([555])
final_plot = alt.vconcat(one,two,three).configure_title(anchor='middle')
final_plot.display()
Now make for all contact sites¶
In [24]:
empty_charts = []
for site in config['contact_sites']:
tmp_df = df_binding_effect_merge[df_binding_effect_merge['site'] == site]
non_nan_in_effect_E2 = tmp_df['binding_mean_E2'].notnull().any()
non_nan_in_effect_E3 = tmp_df['binding_mean_E3'].notnull().any()
if non_nan_in_effect_E2 and non_nan_in_effect_E3:
contact_plots = plot_affinity_individual_mutants(tmp_df,site)
empty_charts.append(contact_plots)
else:
pass
all_contact_plots = alt.vconcat(*empty_charts).resolve_scale(x='shared', y='shared')
all_contact_plots.display()
Make correlation amino acid letter plots for 580-590 loop¶
In [25]:
def make_hconcat_amino_acid(site_list):
empty_chart = []
for site in site_list:
tmp_df = df_binding_effect_merge[df_binding_effect_merge['site'] == site]
non_nan_in_effect_E2 = tmp_df['binding_mean_E2'].notnull().any()
non_nan_in_effect_E3 = tmp_df['binding_mean_E3'].notnull().any()
if non_nan_in_effect_E2 and non_nan_in_effect_E3:
test_plots = plot_affinity_individual_mutants(tmp_df,site)
empty_chart.append(test_plots)
else:
pass
empty_chart_plot = alt.hconcat(*empty_chart)
return empty_chart_plot
one = make_hconcat_amino_acid([580,582,583])
two = make_hconcat_amino_acid([584,586,587])
three = make_hconcat_amino_acid([588,589])
final_plot = alt.vconcat(one,two,three).configure_title(anchor='middle')
final_plot.display()
In [26]:
for site in list(range(580,590)):
tmp_df = df_binding_effect_merge[df_binding_effect_merge['site'] == site]
non_nan_in_effect_E2 = tmp_df['binding_mean_E2'].notnull().any()
non_nan_in_effect_E3 = tmp_df['binding_mean_E3'].notnull().any()
if non_nan_in_effect_E2 and non_nan_in_effect_E3:
test_plots = plot_affinity_individual_mutants(tmp_df,site)
test_plots.display()
else:
pass
Plot correlations between each site for mean value¶
In [27]:
def plot_affinity_solid_mean(df):
df = df.dropna()
means = (
df.groupby("site")
.agg(
{
"effect_E2": "mean",
"effect_E3": "mean",
"binding_mean_E2": "mean",
"binding_mean_E3": "mean",
"wildtype": "first",
}
)
.reset_index().round(2)
)
slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(
means["binding_mean_E2"], means["binding_mean_E3"]
)
r_value = float(r_value)
chart = (
alt.Chart(
means,
title=alt.Title(
"Correlation between Aggregate Mutant Binding Scores",
subtitle=f"r={r_value:.2f}",
),
)
.mark_point(size=50, opacity=0.5,filled=True,color='gray')
.encode(
x=alt.X(
"binding_mean_E2",
title=("Mean bEFNB2 binding"),
axis=alt.Axis(tickCount=3),
),
y=alt.Y(
"binding_mean_E3",
title=("Mean bEFNB3 binding"),
axis=alt.Axis(tickCount=3),
),
tooltip=[
"site",
"wildtype",
"binding_mean_E2",
"binding_mean_E3",
"effect_E2",
"effect_E3",
],
)
)
text = (
alt.Chart({"values": [{"x": -3.5, "y": 0.5, "text": f"r = {r_value:.2f}"}]})
.mark_text(align="left", baseline="top", dx=0, dy=0)
.encode(x=alt.X("x:Q"), y=alt.Y("y:Q"), text="text:N")
)
chart_and_text = chart.properties(width=200,height=200)
return chart_and_text
E2_E3_site_corr = plot_affinity_solid_mean(df_binding_effect_merge)
E2_E3_site_corr.display()
if entry_binding_combined_corr_plot is not None:
E2_E3_site_corr.save(E2_E3_correlation_site)
Make plot showing binding by site¶
In [28]:
def plot_affinity_by_site(df):
# define ranges of different RBP regions
barrel_ranges = {
"Stalk": list(range(70, 148)),
"Neck": list(range(148, 166)),
"Linker": list(range(166, 178)),
"Head": list(range(178, 603)),
}
custom_order = ["Stalk", "Neck", "Linker", "Head"] #custom order for color legend
agg_means = [] #store aggregation
# For each barrel, filter the dataframe to the sites belonging to that barrel and then store the means
for barrel, sites in barrel_ranges.items():
subset = df[df["site"].isin(sites)]
for _, row in subset.iterrows():
agg_means.append(
{"region": barrel, "binding_mean": row["binding_mean"], "site": row["site"],"selection": row["selection"]}
)
agg_means_df = pd.DataFrame(agg_means).round(2)
# Setup interactivity
variant_selector = alt.selection_point(
on="mouseover", nearest=True, empty=False, fields=["site"], value=0
)
#make chart
chart = alt.Chart(agg_means_df).mark_bar(stroke='black',size=1.5,binSpacing=0,color='black').encode(
alt.X("site")
.title("Site")
.axis(tickCount=5,labelAngle=-90,grid=True),
#.scale(domain=[70, 602]),
alt.Y('mean(binding_mean)')
.axis(tickCount=3)
.title('Mean binding'),
alt.Row('selection',title=None),
tooltip=['site'],
strokeWidth=alt.condition(variant_selector, alt.value(1), alt.value(0)),
color=alt.Color('region',sort=custom_order,title='Region'),
#strokeColor=alt.condition(variant_selector, alt.value('orange'), alt.value('black')),
).properties(height=150, width=800).add_params(variant_selector).resolve_scale(y='independent')
return chart
binding_by_site = plot_affinity_by_site(df_binding_effect_concat)
binding_by_site.display()
Make bar chart showing max binding score for each contact residue¶
In [29]:
def plot_affinity_by_contact_site(df, sites_to_show):
# Filter the DataFrame based on the sites to show
contact_df = df[df["site"].isin(sites_to_show)]
# Define a selection for highlighting bars on hover
variant_selector = alt.selection_point(on="mouseover", nearest=True, empty=False, fields=["site"])
# Create the chart
chart = alt.Chart(contact_df).mark_bar(stroke='black').encode(
y=alt.Y("site:N", title="Site",sort='-x'),
x=alt.X("max(binding_mean):Q", title="Max binding mutant at site"),
color=alt.condition(variant_selector, alt.value("orange"), alt.value("black")),
strokeWidth=alt.condition(variant_selector, alt.value(1), alt.value(0)),
column=alt.Column('selection',title=None)
).add_params(variant_selector).properties(width=200, height=alt.Step(12)).resolve_scale(x='independent',y='shared').configure_header(
labelFontSize=20,
labelAngle=0,
labelAlign='center',
labelAnchor='middle',
labelFont='Helvetica Light',
labelFontStyle='bold',
labelPadding=3
)
return chart
In [30]:
contact_binding_by_site = plot_affinity_by_contact_site(df_binding_effect_concat, config["contact_sites"])
contact_binding_by_site.display()
Make bubble plots for binding in different areas of receptor pocket¶
In [31]:
def make_bubble_binding_region(df):
barrel_ranges = {
"Stalk": list(range(70, 148)),
"Neck": list(range(148, 166)),
"Linker": list(range(166, 178)),
"Head": list(range(178, 603)),
"Receptor Contact": config["contact_sites"],
}
custom_order = ["Stalk", "Neck", "Linker", "Head", "Receptor Contact"]
agg_means = []
# For each barrel, filter the site_means dataframe to the sites belonging to that barrel and then store the means
for barrel, sites in barrel_ranges.items():
subset = df[df["site"].isin(sites)]
for _, row in subset.iterrows():
agg_means.append(
{
"region": barrel,
"binding_mean": row["binding_mean"],
"site": row["site"],
"mutant": row["mutant"],
"selection": row["selection"],
}
)
agg_means_df = pd.DataFrame(agg_means)
variant_selector = alt.selection_point(
on="mouseover", empty=False, nearest=True, fields=["site", "mutant"], value=1
)
chart = alt.Chart(agg_means_df).mark_point(stroke="black",filled=True).encode(
x=alt.X(
"region:O",
sort=custom_order,
title="RBP Region",
axis=alt.Axis(labelAngle=-90),
),
y=alt.Y(
"binding_mean",
title="Binding",
axis=alt.Axis(tickCount=4),
),
xOffset="random:Q",
tooltip=["region", "binding_mean", "site", "mutant"],
color=alt.condition(
variant_selector, alt.value("orange"), alt.value("black")
),
opacity=alt.condition(variant_selector, alt.value(1), alt.value(0.1)),
strokeWidth=alt.condition(variant_selector, alt.value(2), alt.value(0)),
size=alt.condition(variant_selector, alt.value(70), alt.value(20)),
column=alt.Column('selection',title=None)
).transform_calculate(random="sqrt(-1*log(random()))*cos(2*PI*random())"
).properties(
width=300,
height=300
).add_params(variant_selector
).resolve_scale(y='independent'
).configure_header(
labelFontSize=20,
labelAngle=0,
labelAlign='center',
labelAnchor='middle',
labelFont='Helvetica Light',
labelFontStyle='bold',
labelPadding=1
)
return chart
entry_region_bubble = make_bubble_binding_region(df_binding_effect_concat)
entry_region_bubble.display()
Plot effects of selected mutants on cell entry¶
In [32]:
def plot_effects_of_mutants(df):
tmp_df = df.copy()
tmp_df = tmp_df[
((tmp_df['site'] == 598) & (tmp_df['mutant'] == 'G')) |
((tmp_df['site'] == 553) & (tmp_df['mutant'] == 'W')) |
((tmp_df['site'] == 211) & (tmp_df['mutant'] == 'F')) |
((tmp_df['site'] == 546) & (tmp_df['mutant'] == 'H')) |
((tmp_df['site'] == 143) & (tmp_df['mutant'] == 'Q')) |
((tmp_df['site'] == 331) & (tmp_df['mutant'] == 'E')) |
((tmp_df['site'] == 566) & (tmp_df['mutant'] == 'C'))
]
tmp_df['label'] = (tmp_df['wildtype'].astype(str) + tmp_df['site'].astype(str) + tmp_df['mutant'].astype(str))
tmp_df = tmp_df.sort_values(by='site')
binding_chart = alt.Chart(tmp_df).mark_bar().encode(
alt.X("label:N",title='Mutant',sort=None),
alt.Y("binding_mean:Q",title='Binding score',axis=alt.Axis(tickCount=4)),
xOffset="selection:N",
color=alt.Color("selection:N",title='Receptor selection'),
)
binding_chart.resolve_scale(y='independent',x='shared').display()
plot_effects_of_mutants(df_binding_effect_concat)
In [ ]: