ephrin_binding.ipynb¶

Analyze effect of RBP mutations on receptor binding from DMS selection data using soluble bat Ephrin-B2 or -B3

  • Written by Brendan Larsen

Papermill parameterization¶

In [1]:
# this cell is tagged as parameters for `papermill` parameterization
# input configs
nipah_config = None

# input files
binding_E2_file = None
binding_E3_file = None

# output images
entry_binding_combined_corr_plot = None
E2_E3_correlation = None
E2_E3_correlation_site = None
In [2]:
# Parameters
nipah_config = "nipah_config.yaml"
binding_E2_file = "results/filtered_data/binding/e2_binding_filtered.csv"
binding_E3_file = "results/filtered_data/binding/e3_binding_filtered.csv"
entry_binding_combined_corr_plot = (
    "results/images/entry_binding_combined_corr_plot.html"
)
E2_E3_correlation = "results/images/E2_E3_correlation.html"
E2_E3_correlation_site = "results/images/E2_E3_correlation_site.html"

Import libraries, setup working directory, and import altair themes¶

In [3]:
import os
import math
import re
import altair as alt
import numpy as np
import pandas as pd
import scipy.stats
import yaml
import sys

# allow more rows for Altair
_ = alt.data_transformers.disable_max_rows()

# setup working directory
if os.getcwd() == "/fh/fast/bloom_j/computational_notebooks/blarsen/2023/Nipah_Malaysia_RBP_DMS/":
    pass
    print("Already in correct directory")
else:
    os.chdir("/fh/fast/bloom_j/computational_notebooks/blarsen/2023/Nipah_Malaysia_RBP_DMS/")
    print("Setup in correct directory")

#import altair themes from /data/custom_analyses_data/theme.py and enable
sys.path.append('data/custom_analyses_data/')
import theme
alt.themes.register('main_theme', theme.main_theme)
alt.themes.enable('main_theme')
Setup in correct directory
Out[3]:
ThemeRegistry.enable('main_theme')

Set hard paths for running in interactive mode¶

In [4]:
if nipah_config is None:
    print("loading hard paths")
    nipah_config = "nipah_config.yaml"

    # input files
    binding_E2_file = "results/filtered_data/binding/e2_binding_filtered.csv"
    binding_E3_file = "results/filtered_data/binding/e3_binding_filtered.csv"

Import config variables¶

In [5]:
with open(nipah_config) as f:
    config = yaml.safe_load(f)

Import and merge data¶

Import the filtered binding and entry data for bEFNB2 and bEFNB3¶

In [6]:
# import binding data
df_E2_filter = pd.read_csv(binding_E2_file)
df_E3_filter = pd.read_csv(binding_E3_file)

Merge data¶

In [7]:
## Merge the filtered EFNB2 and EFNB3 DataFrames for combined analysis.
df_binding_effect_merge = pd.merge(
    df_E2_filter,
    df_E3_filter,
    on=["site", "wildtype", "mutant"],
    suffixes=["_E2", "_E3"],
    how="outer",
)
display(df_binding_effect_merge.head(3))
## Add a 'selection' column to distinguish between EFNB2 and EFNB3 data.
df_E2_filter["selection"] = "bEFNB2"
df_E3_filter["selection"] = "bEFNB3"

## Concatenate the DataFrames for plotting or further analysis.
df_binding_effect_concat = pd.concat([df_E2_filter, df_E3_filter])
site wildtype mutant binding_mean_E2 binding_std_E2 times_seen_binding_E2 effect_E2 effect_std_E2 times_seen_cell_entry_E2 frac_models_E2 mutant_type_E2 binding_mean_E3 binding_std_E3 times_seen_binding_E3 effect_E3 effect_std_E3 times_seen_cell_entry_E3 frac_models_E3 mutant_type_E3
0 71 Q D -0.7817 1.6460 3.25 -1.164 0.8890 4.500 1.0 Negative -0.29370 0.3305 4.0 -0.3884 0.6369 3.429 1.0 Negative
1 71 Q E 0.1659 0.4309 3.50 -1.255 0.3123 5.375 1.0 Negative -0.35270 0.1762 4.0 -0.2482 0.9791 4.571 1.0 Negative
2 71 Q F -0.3429 0.5299 3.00 -1.058 0.6637 4.625 1.0 Aromatic -0.03982 0.1283 3.0 -0.4973 0.3080 3.286 1.0 Aromatic

Calculate stats¶

In [8]:
# What are the top 5 highest and lowest binding mutants for EFNB2 and EFNB3?
def find_highest_lowest(df, name):
    df = df.round(2)
    print(f"We are analyzing {name}\n")
    print(f"The total number of mutants was: {df.shape[0]}\n")
    tmp_df = df.sort_values(by="binding_mean")
    print("These are the lowest binding mutants detected:")
    display(tmp_df.head(5))

    tmp_df_high = df.sort_values(by="binding_mean", ascending=False)
    print("\nThese are the highest binding mutants detected:\n")
    display(tmp_df_high.head(5))

    # What about mutants with positive entry scores?
    tmp_df = df[df["effect"] > 0].sort_values(by="binding_mean")
    print("These are the lowest binding mutants detected with positive entry scores:")
    display(tmp_df.head(5))

    tmp_df_high = df[df["effect"] > 0].sort_values(by="binding_mean", ascending=False)
    print(
        "\nThese are the highest binding mutants detected with positive entry scores:\n"
    )
    display(tmp_df_high.head(5))

    mean_df = df.groupby('site')['binding_mean'].sum().reset_index()
    print('These are the sites with the highest summed binding score:\n')
    display(mean_df.sort_values(by='binding_mean',ascending=False).head(10))


find_highest_lowest(df_E2_filter, "bEFNB2")
find_highest_lowest(df_E3_filter, "bEFNB3")
We are analyzing bEFNB2

The total number of mutants was: 6672

These are the lowest binding mutants detected:
site wildtype mutant binding_mean binding_std times_seen_binding effect effect_std times_seen_cell_entry frac_models mutant_type selection
5369 501 E K -5.28 0.93 24.50 -0.64 0.61 32.00 1.0 Positive bEFNB2
5744 532 A V -5.26 1.14 16.25 -0.32 0.31 17.88 1.0 Hydrophobic bEFNB2
1770 236 R H -5.05 0.88 22.00 -1.25 0.34 29.38 1.0 Positive bEFNB2
5240 490 Q K -4.97 1.08 8.50 -0.69 0.54 11.38 1.0 Positive bEFNB2
5238 490 Q H -4.75 0.86 8.25 -0.45 0.67 10.88 1.0 Positive bEFNB2
These are the highest binding mutants detected:

site wildtype mutant binding_mean binding_std times_seen_binding effect effect_std times_seen_cell_entry frac_models mutant_type selection
6365 580 I S 2.15 1.77 4.75 -1.00 0.69 6.38 1.0 Hydrophilic bEFNB2
2969 331 N E 2.14 1.23 3.00 -0.89 0.84 3.50 1.0 Negative bEFNB2
6468 586 N T 1.96 0.58 5.25 -0.82 0.82 7.00 1.0 Hydrophilic bEFNB2
1457 211 G F 1.96 0.55 4.75 -0.67 0.43 5.12 1.0 Aromatic bEFNB2
6040 553 S W 1.94 0.29 10.00 -0.30 0.43 13.25 1.0 Aromatic bEFNB2
These are the lowest binding mutants detected with positive entry scores:
site wildtype mutant binding_mean binding_std times_seen_binding effect effect_std times_seen_cell_entry frac_models mutant_type selection
6102 558 A V -4.65 0.73 17.25 0.12 0.36 18.50 1.0 Hydrophobic bEFNB2
6089 557 N D -4.63 0.40 9.50 0.38 0.48 11.12 1.0 Negative bEFNB2
5654 526 L H -4.55 0.78 5.25 0.11 0.39 6.75 1.0 Positive bEFNB2
6383 581 Y W -4.52 1.01 10.75 0.09 0.41 11.50 1.0 Aromatic bEFNB2
5252 491 S H -4.50 0.94 8.50 0.11 0.25 10.25 1.0 Positive bEFNB2
These are the highest binding mutants detected with positive entry scores:

site wildtype mutant binding_mean binding_std times_seen_binding effect effect_std times_seen_cell_entry frac_models mutant_type selection
2264 282 C S 1.61 1.06 4.50 0.34 0.33 3.71 0.5 Hydrophilic bEFNB2
4727 450 Q I 1.46 1.19 5.00 0.14 0.40 6.00 1.0 Hydrophobic bEFNB2
787 164 N S 1.30 1.14 3.75 0.24 0.29 6.12 1.0 Hydrophilic bEFNB2
6173 564 N K 1.26 0.49 7.25 0.15 0.44 8.00 1.0 Positive bEFNB2
3633 376 K F 1.21 1.78 3.25 0.17 0.27 2.62 1.0 Aromatic bEFNB2
These are the sites with the highest summed binding score:

site binding_mean
248 331 15.73
471 564 15.10
95 172 13.87
491 586 13.38
223 306 12.34
101 178 9.28
149 227 8.72
237 320 8.57
199 282 8.46
90 167 8.39
We are analyzing bEFNB3

The total number of mutants was: 6503

These are the lowest binding mutants detected:
site wildtype mutant binding_mean binding_std times_seen_binding effect effect_std times_seen_cell_entry frac_models mutant_type selection
6013 555 D K -2.15 1.23 6.0 0.09 0.42 6.71 1.0 Positive bEFNB3
6016 555 D R -1.51 0.42 4.0 0.08 0.53 4.14 1.0 Positive bEFNB3
5699 530 Q L -1.45 0.48 3.5 -1.02 0.86 5.86 1.0 Hydrophobic bEFNB3
6301 583 T R -1.41 0.45 5.5 -1.00 0.49 6.29 1.0 Positive bEFNB3
6298 583 T K -1.25 0.11 5.0 -0.73 0.64 7.00 1.0 Positive bEFNB3
These are the highest binding mutants detected:

site wildtype mutant binding_mean binding_std times_seen_binding effect effect_std times_seen_cell_entry frac_models mutant_type selection
6347 589 R G 2.01 0.66 7.0 -0.97 0.58 8.29 1.0 Special bEFNB3
6267 580 I M 1.34 0.53 5.5 -0.82 0.53 6.71 1.0 Hydrophobic bEFNB3
2298 270 V Q 1.33 1.03 5.0 -0.88 0.65 5.43 1.0 Hydrophilic bEFNB3
2251 267 M Q 1.29 1.37 4.5 -0.80 0.82 5.67 1.0 Hydrophilic bEFNB3
5352 492 Q L 1.26 0.07 5.5 0.48 0.18 5.14 1.0 Hydrophobic bEFNB3
These are the lowest binding mutants detected with positive entry scores:
site wildtype mutant binding_mean binding_std times_seen_binding effect effect_std times_seen_cell_entry frac_models mutant_type selection
6013 555 D K -2.15 1.23 6.0 0.09 0.42 6.71 1.0 Positive bEFNB3
6016 555 D R -1.51 0.42 4.0 0.08 0.53 4.14 1.0 Positive bEFNB3
5702 530 Q W -1.00 0.55 3.0 0.02 0.66 3.29 1.0 Aromatic bEFNB3
6007 555 D A -0.83 0.20 3.5 0.38 0.23 3.71 1.0 Hydrophobic bEFNB3
4751 441 P V -0.75 0.18 5.5 0.37 0.09 5.29 1.0 Hydrophobic bEFNB3
These are the highest binding mutants detected with positive entry scores:

site wildtype mutant binding_mean binding_std times_seen_binding effect effect_std times_seen_cell_entry frac_models mutant_type selection
5352 492 Q L 1.26 0.07 5.5 0.48 0.18 5.14 1.0 Hydrophobic bEFNB3
1665 211 G F 1.17 0.29 5.0 0.16 0.57 5.71 1.0 Aromatic bEFNB3
6429 598 P G 1.15 1.56 4.5 0.24 0.51 5.43 1.0 Special bEFNB3
5987 553 S W 0.99 0.44 9.5 0.17 0.41 9.86 1.0 Aromatic bEFNB3
6290 582 D W 0.98 0.34 6.5 0.06 0.31 7.14 1.0 Aromatic bEFNB3
These are the sites with the highest summed binding score:

site binding_mean
489 589 11.70
86 161 7.33
468 564 6.73
224 306 6.50
59 132 6.17
241 323 6.10
469 566 5.97
260 342 5.72
128 204 5.64
174 254 5.14
In [9]:
def overall_stats(df, effect, name):
    # Now group sites and find sites where all mutants are deleterious
    filtered_df = df.groupby("site").filter(lambda group: (group[effect] < -0.25).all())
    # Which sites are these?
    unique = filtered_df["site"].unique()
    # Convert unique to a Pandas Series
    unique_series = pd.Series(unique)

    # Find the common elements that are also contact sites
    unique_contact_bool = unique_series.isin(config["contact_sites"])
    # Filter and get the common elements
    common_elements = unique_series[unique_contact_bool]

    print(f"The dataset we are analyzing is: {name}\n")

    # Print the common elements
    print(
        f"Here are the contact sites that only have negative binding scores: {list(common_elements)}\n"
    )

    print(f"There are {len(unique)} sites with all negative binding score mutants\n")
    print(
        f"These are the sites with all negative binding score mutants: {list(unique)}\n"
    )

    # Now find sites with low and high binding (median)
    median_df = (
        df.groupby("site")["binding_mean"]
        .max()
        .reset_index()
        .sort_values(by="binding_mean", ascending=False)
    )
    print("These are the sites with the highest binding mutants:\n")
    display(median_df.head(5))

    # Now calculate mutant number
    total_mutants = df.shape[0]

    mutants_above_cutoff_tolerated = df[df["effect"] > 0]
    mutants_above_cutoff_tolerated = mutants_above_cutoff_tolerated[
        ["site", "effect", "binding_mean", "wildtype", "mutant"]
    ]

    total_sites = df["site"].unique().shape[0]

    print(f"The total number of sites are: {total_sites}")


overall_stats(df_E2_filter, "binding_mean", "EFNB2")
overall_stats(df_E3_filter, "binding_mean", "EFNB3")
The dataset we are analyzing is: EFNB2

Here are the contact sites that only have negative binding scores: [239, 242, 389, 488, 490, 491, 501, 504, 505, 531, 532, 533, 557, 579, 581, 588]

There are 43 sites with all negative binding score mutants

These are the sites with all negative binding score mutants: [100, 116, 220, 236, 238, 239, 242, 243, 248, 346, 351, 352, 389, 390, 399, 400, 435, 438, 441, 460, 467, 486, 487, 488, 490, 491, 494, 495, 497, 501, 504, 505, 526, 531, 532, 533, 557, 579, 581, 584, 585, 588, 590]

These are the sites with the highest binding mutants:

site binding_mean
485 580 2.152
248 331 2.144
491 586 1.962
134 211 1.957
460 553 1.945
The total number of sites are: 508
The dataset we are analyzing is: EFNB3

Here are the contact sites that only have negative binding scores: [389, 488, 501, 505, 531, 532]

There are 16 sites with all negative binding score mutants

These are the sites with all negative binding score mutants: [108, 140, 352, 389, 467, 486, 488, 494, 495, 497, 501, 505, 510, 531, 532, 584]

These are the sites with the highest binding mutants:

site binding_mean
489 589 2.006
481 580 1.337
188 270 1.329
185 267 1.291
404 492 1.261
The total number of sites are: 503

Find sites with opposite effects on binding¶

In [10]:
# find sites that are different
def find_biggest_differences(df):
    df = df.round(2)
    efnb2_good_efnb3_bad = df[
        (df["binding_mean_E2"] > 0.5) & (df["binding_mean_E3"] < -0.5)
    ].sort_values(by="binding_mean_E2", ascending=False)
    print('mutantions good for efnb2 binding, bad for efnb3 binding:\n')
    display(efnb2_good_efnb3_bad)

    efnb2_bad_efnb3_good = df[
        (df["binding_mean_E2"] < -0.5) & (df["binding_mean_E3"] > 0.5)
    ].sort_values(by="binding_mean_E3", ascending=False)
    print('mutantions bad for efnb2 binding, good for efnb3 binding:\n')
    display(efnb2_bad_efnb3_good)


find_biggest_differences(df_binding_effect_merge)
mutantions good for efnb2 binding, bad for efnb3 binding:

site wildtype mutant binding_mean_E2 binding_std_E2 times_seen_binding_E2 effect_E2 effect_std_E2 times_seen_cell_entry_E2 frac_models_E2 mutant_type_E2 binding_mean_E3 binding_std_E3 times_seen_binding_E3 effect_E3 effect_std_E3 times_seen_cell_entry_E3 frac_models_E3 mutant_type_E3
5924 546 L H 1.52 1.14 5.25 -1.15 0.39 6.75 1.0 Positive -0.54 0.67 3.5 -0.85 0.54 5.14 1.0 Positive
2543 303 P C 1.34 1.58 5.00 -0.72 0.59 5.00 1.0 Special -0.57 0.10 4.0 -0.70 0.59 4.86 1.0 Special
112 79 N S 1.29 1.65 3.00 -0.64 0.50 2.38 1.0 Hydrophilic -0.65 0.26 3.0 -0.13 0.57 3.43 1.0 Hydrophilic
93 78 D I 0.66 0.63 5.25 -0.93 0.40 6.50 1.0 Hydrophobic -0.57 0.76 5.0 -0.52 0.84 5.14 1.0 Hydrophobic
1637 224 M Q 0.65 0.88 3.50 0.17 0.31 2.75 1.0 Hydrophilic -0.58 0.38 3.5 0.21 0.29 4.29 1.0 Hydrophilic
5591 520 I G 0.59 1.01 5.00 -1.46 0.35 8.00 1.0 Special -0.53 0.15 5.5 -1.31 0.32 7.00 1.0 Special
996 178 V T 0.53 0.73 3.75 -0.53 0.76 4.25 1.0 Hydrophilic -0.51 0.60 3.0 -0.02 0.43 3.57 1.0 Hydrophilic
mutantions bad for efnb2 binding, good for efnb3 binding:

site wildtype mutant binding_mean_E2 binding_std_E2 times_seen_binding_E2 effect_E2 effect_std_E2 times_seen_cell_entry_E2 frac_models_E2 mutant_type_E2 binding_mean_E3 binding_std_E3 times_seen_binding_E3 effect_E3 effect_std_E3 times_seen_cell_entry_E3 frac_models_E3 mutant_type_E3
6498 588 I P -2.35 0.84 5.25 -0.26 0.30 4.75 1.0 Special 1.18 0.26 5.5 -0.61 0.38 5.43 1.0 Special
6602 598 P G -0.55 0.79 5.00 -0.87 0.94 4.62 1.0 Special 1.15 1.56 4.5 0.24 0.51 5.43 1.0 Special
3093 338 R G -0.52 0.51 4.50 0.30 0.42 5.25 1.0 Special 0.68 0.98 4.0 0.16 0.34 3.43 1.0 Special
5248 491 S A -0.93 0.54 5.50 -0.67 0.81 6.25 1.0 Hydrophobic 0.65 0.01 6.5 0.14 0.36 6.14 1.0 Hydrophobic
5274 492 Q R -2.82 0.61 19.25 -0.08 0.27 23.12 1.0 Positive 0.64 0.05 20.0 0.51 0.12 20.57 1.0 Positive

Find the top overlapping binders for both bEFNB2 and bEFNB3¶

In [11]:
def find_good_binding_for_both(df):
    e2_good = df.sort_values(by='binding_mean_E2',ascending=False)
    e3_good = df.sort_values(by='binding_mean_E3',ascending=False)
    e2_good = e2_good.head(50)
    e3_good = e3_good.head(50)
    combo = pd.merge(e2_good,e3_good,on=['site','wildtype','mutant'],how='inner')
    display(combo)

find_good_binding_for_both(df_binding_effect_merge)
site wildtype mutant binding_mean_E2_x binding_std_E2_x times_seen_binding_E2_x effect_E2_x effect_std_E2_x times_seen_cell_entry_E2_x frac_models_E2_x ... frac_models_E2_y mutant_type_E2_y binding_mean_E3_y binding_std_E3_y times_seen_binding_E3_y effect_E3_y effect_std_E3_y times_seen_cell_entry_E3_y frac_models_E3_y mutant_type_E3_y
0 580 I S 2.152 1.7680 4.750 -1.0010 0.6921 6.375 1.00 ... 1.00 Hydrophilic 0.8187 0.030890 6.0 0.1163 0.3235 5.571 1.0 Hydrophilic
1 211 G F 1.957 0.5488 4.750 -0.6654 0.4262 5.125 1.00 ... 1.00 Aromatic 1.1690 0.289000 5.0 0.1637 0.5716 5.714 1.0 Aromatic
2 553 S W 1.945 0.2867 10.000 -0.2967 0.4293 13.250 1.00 ... 1.00 Aromatic 0.9943 0.435200 9.5 0.1669 0.4060 9.857 1.0 Aromatic
3 139 N W 1.930 0.4834 5.750 -0.6672 0.6217 8.000 1.00 ... 1.00 Aromatic 1.0030 0.004575 4.5 -0.4441 0.4452 6.286 1.0 Aromatic
4 589 R G 1.668 0.9018 7.333 -1.3800 0.4560 11.880 0.75 ... 0.75 Special 2.0060 0.659100 7.0 -0.9655 0.5763 8.286 1.0 Special
5 306 N R 1.523 1.8470 3.750 -1.2790 0.4835 5.625 1.00 ... 1.00 Positive 0.9003 0.713800 4.5 -0.8509 0.5830 6.286 1.0 Positive

6 rows × 35 columns

Find sites with the largest absolute difference in binding¶

In [12]:
# Compare E2 and E3 binders
def find_highest_lowest(df):
    df = df.round(2)
    df["binding_diff"] = (df["binding_mean_E2"] - df["binding_mean_E3"]).abs()
    print(
        "These are the mutants with the biggest difference between EFNB2 and EFNB3:\n"
    )
    display(df.sort_values(by="binding_diff", ascending=False).head(10))

    # calculate aggregate differences
    agg_df = (
        df.groupby("site")[["binding_mean_E2", "binding_mean_E3", "binding_diff"]]
        .mean()
        .reset_index()
    )
    agg_df = agg_df.round(2)
    print("These are the sites with the biggest difference between EFNB2 and EFNB3:\n")
    display(agg_df.sort_values(by="binding_diff", ascending=False).head(5))


find_highest_lowest(df_binding_effect_merge)
These are the mutants with the biggest difference between EFNB2 and EFNB3:

site wildtype mutant binding_mean_E2 binding_std_E2 times_seen_binding_E2 effect_E2 effect_std_E2 times_seen_cell_entry_E2 frac_models_E2 mutant_type_E2 binding_mean_E3 binding_std_E3 times_seen_binding_E3 effect_E3 effect_std_E3 times_seen_cell_entry_E3 frac_models_E3 mutant_type_E3 binding_diff
5718 530 Q F -4.01 0.47 6.25 0.47 0.20 5.38 1.0 Aromatic 0.08 0.06 6.0 0.36 0.27 6.29 1.0 Aromatic 4.09
5247 490 Q Y -4.29 0.90 5.50 -1.09 0.40 6.38 1.0 Aromatic -0.46 0.57 5.5 -0.31 0.80 5.71 1.0 Aromatic 3.83
5252 491 S H -4.50 0.94 8.50 0.11 0.25 10.25 1.0 Positive -0.87 0.79 7.0 -1.38 0.49 8.29 1.0 Positive 3.63
5251 491 S G -4.18 0.59 6.50 0.13 0.30 8.62 1.0 Special -0.56 0.05 5.5 -0.24 0.61 6.29 1.0 Special 3.62
5246 490 Q W -4.20 1.00 3.75 0.23 0.38 4.75 1.0 Aromatic -0.59 0.32 3.0 -0.15 0.46 3.29 1.0 Aromatic 3.61
6498 588 I P -2.35 0.84 5.25 -0.26 0.30 4.75 1.0 Special 1.18 0.26 5.5 -0.61 0.38 5.43 1.0 Special 3.53
5270 492 Q K -3.52 0.55 11.00 0.35 0.23 11.62 1.0 Positive 0.01 0.08 10.0 0.33 0.24 11.57 1.0 Positive 3.53
5274 492 Q R -2.82 0.61 19.25 -0.08 0.27 23.12 1.0 Positive 0.64 0.05 20.0 0.51 0.12 20.57 1.0 Positive 3.46
1814 239 S H -3.44 0.09 5.25 0.33 0.27 4.50 1.0 Positive -0.04 0.83 4.0 -1.22 0.69 5.00 1.0 Positive 3.40
5727 530 Q V -4.33 0.84 6.00 0.11 0.27 8.62 1.0 Hydrophobic -0.99 0.90 5.5 -1.44 0.74 7.00 1.0 Hydrophobic 3.34
These are the sites with the biggest difference between EFNB2 and EFNB3:

site binding_mean_E2 binding_mean_E3 binding_diff
415 505 -3.79 -0.77 2.90
439 530 -3.63 -0.56 2.80
405 491 -3.58 -0.36 2.78
495 588 -3.50 0.04 2.69
404 490 -3.81 -0.37 2.50

Make plots¶

Make plots showing correlation between binding and entry for EFNB2 and EFNB3¶

In [13]:
def plot_corr_binding_entry_updated(df, flag):
    df = df.copy().round(2)
    # Define interactive selectors for variant selection.
    variant_selector = alt.selection_point(
        on="mouseover", nearest=True, empty=False, fields=["site", "mutant"], value=0
    )
    # 'variant_selector_agg' is for aggregated selection based on 'site' only.
    variant_selector_agg = alt.selection_point(
        on="mouseover", nearest=True, empty=False, fields=["site"], value=0
    )

    # Initialize an empty list to store charts for each unique selection in 'df'.
    empty_chart = []
    
    # Loop through each unique cell selection in the DataFrame.
    for cell in list(df["selection"].unique()):
        # Filter DataFrame for the current selection.
        tmp_df = df[df["selection"] == cell]
        
        # Check if aggregation flag is True to aggregate data.
        if flag == True:
            # Aggregate data by 'site', summing 'binding_median' and 'effect', then reset index.
            agg_df = (
                tmp_df.groupby("site")[["binding_mean", "effect"]].mean().reset_index().round(2)
            )
            # Create a chart using aggregated data with specified encodings.
            chart = (
                alt.Chart(agg_df)
                .mark_point(stroke="black", filled=True)  # Use filled points with black stroke.
                .encode(
                    x=alt.X(
                        "effect",
                        title=f"Mean entry in CHO-{cell}",
                        axis=alt.Axis(grid=False),
                    ),
                    y=alt.Y(
                        "binding_mean",
                        title=f"Mean {cell} binding",
                        axis=alt.Axis(grid=False),
                    ),
                    # Dynamic opacity, size, strokeWidth, and color based on 'variant_selector_agg'.
                    opacity=alt.condition(
                        variant_selector_agg, alt.value(1), alt.value(0.2)
                    ),
                    size=alt.condition(
                        variant_selector_agg, alt.value(100), alt.value(50)
                    ),
                    strokeWidth=alt.condition(
                        variant_selector_agg, alt.value(1), alt.value(0)
                    ),
                    color=alt.condition(
                        variant_selector_agg, alt.value("orange"), alt.value("gray")
                    ),
                    tooltip=["site", "binding_mean", "effect"],
                )
                .add_params(variant_selector_agg)
            )
            # Add the chart to the list.
            empty_chart.append(chart)

        else:
            # Create a chart for individual data points with specified encodings.
            chart = (
                alt.Chart(tmp_df)
                .mark_point(stroke="black", filled=True)
                .encode(
                    x=alt.X(
                        "effect", title=f"Entry in CHO-{cell}", axis=alt.Axis(grid=False)
                    ),
                    y=alt.Y(
                        "binding_mean",
                        title=f"{cell} binding",
                        axis=alt.Axis(grid=False),
                    ),
                    # Dynamic opacity, size, strokeWidth, and color based on 'variant_selector'.
                    opacity=alt.condition(
                        variant_selector, alt.value(1), alt.value(0.1)
                    ),
                    size=alt.condition(variant_selector, alt.value(50), alt.value(20)),
                    strokeWidth=alt.condition(
                        variant_selector, alt.value(1), alt.value(0)
                    ),
                    color=alt.condition(
                        variant_selector, alt.value("orange"), alt.value("gray")
                    ),
                    tooltip=[
                        "site",
                        "wildtype",
                        "mutant",
                        "binding_mean",
                        "times_seen_binding",
                        "effect",
                    ],
                )
                .add_params(variant_selector)
            )
            # Add the chart to the list.
            empty_chart.append(chart)

    # Combine all charts horizontally with a title.
    combined_chart = alt.hconcat(
        *empty_chart, title=alt.Title("Correlation between binding and entry")
    )
    
    # Return the combined chart for display or further use.
    return combined_chart

# Generate and display plots for non-aggregated data.
entry_binding_corr_plot = plot_corr_binding_entry_updated(df_binding_effect_concat, False)
entry_binding_corr_plot.display()

# Save the plot 
if entry_binding_combined_corr_plot is not None:
    entry_binding_corr_plot.save(entry_binding_combined_corr_plot)

# Repeat for aggregated data.
entry_binding_corr_plot_agg = plot_corr_binding_entry_updated(df_binding_effect_concat, True)
entry_binding_corr_plot_agg.display()

Same plot as above, but slightly different format¶

In [14]:
def plot_entry_binding_corr_heatmap(df):
    empty_chart = []
    for cell in list(df["selection"].unique()):
        tmp_df = df[df["selection"] == cell]
        chart = (
            alt.Chart(tmp_df, title=f"{cell}")
            .mark_rect()
            .encode(
                x=alt.X(
                    "effect", title="Cell entry", axis=alt.Axis(values=[-2, -1, 0, 1])
                ).bin(maxbins=50),
                y=alt.Y(
                    "binding_mean",
                    title="Receptor binding",
                    axis=alt.Axis(values=[-4, -2, 0, 2]),
                ).bin(maxbins=50),
                color=alt.Color("count()", title="Count").scale(type='log'),
            )
        ).properties(width=200,height=200)
        empty_chart.append(chart)

    combined_chart = alt.hconcat(
        *empty_chart, 
    ).resolve_scale(y="shared", x="shared", color="shared")
    return combined_chart


entry_binding_corr_heat = plot_entry_binding_corr_heatmap(df_binding_effect_concat)
entry_binding_corr_heat.display()

Find correlations between bEFNB2 and bEFNB3 binding¶

In [15]:
def plot_entry_binding_corr(df):
    chart = (
        alt.Chart(df, title="Correlation Between Mutant Binding Scores")
        .mark_rect()
        .encode(
            x=alt.X(
                "binding_mean_E2",
                title="bEFNB2 binding",
                axis=alt.Axis(values=[-4,-2, 0, 2]),
            ).bin(maxbins=50),
            y=alt.Y(
                "binding_mean_E3",
                title="bEFNB3 binding",
                axis=alt.Axis(values=[-2, 0, 2]),
            ).bin(maxbins=50),
            color=alt.Color("count()", title="Count").scale(type='log'),
        )
    ).properties(width=200,height=200)
    return chart


entry_binding_corr_heatmap_1 = plot_entry_binding_corr(df_binding_effect_merge)
entry_binding_corr_heatmap_1.display()

Plot correlations of binding mutants in scatterplots, and color different subsets¶

First, find mutations that are outliers in the correlation between bEFNB2 and bEFNB3 binding¶

In [16]:
def find_outliers(df):
    df = df.dropna().copy()
    #Calculate the best fit line
    slope, intercept = np.polyfit(df['binding_mean_E2'], df['binding_mean_E3'], 1)
    
    #compute residuals
    df['predicted_y'] = slope * df['binding_mean_E2'] + intercept
    df['residuals'] = df['binding_mean_E3'] - df['predicted_y']
    
    #identify outliers
    # calculate the mean and standard deviation of the residuals
    mean_residual = np.mean(df['residuals'])
    std_residual = np.std(df['residuals'])

    outlier_threshold = 4.5 #4.5 std deviations
    df['is_outlier'] = abs(df['residuals']) > (std_residual * outlier_threshold)
    
    # Filter the DataFrame to only include outliers
    outliers = df[df['is_outlier']]
    print(f'Here are the outlier mutations outside of a {outlier_threshold} standard deviation:\n')
    display(outliers)
    outliers_list = list(outliers['site'].unique())
    print(f' Here are the sites: \n {outliers_list}')
    return df,outliers_list

residuals_df,outliers_list = find_outliers(df_binding_effect_merge)
Here are the outlier mutations outside of a 4.5 standard deviation:

site wildtype mutant binding_mean_E2 binding_std_E2 times_seen_binding_E2 effect_E2 effect_std_E2 times_seen_cell_entry_E2 frac_models_E2 ... binding_std_E3 times_seen_binding_E3 effect_E3 effect_std_E3 times_seen_cell_entry_E3 frac_models_E3 mutant_type_E3 predicted_y residuals is_outlier
112 79 N S 1.2920 1.6520 3.000 -0.63520 0.4964 2.375 1.00 ... 0.25950 3.0 -0.12940 0.5688 3.429 1.0 Hydrophilic 0.272492 -0.918892 True
566 137 S D -0.4362 0.8381 4.000 -1.10900 0.3349 7.000 1.00 ... 0.83060 4.0 -0.41160 0.7602 3.571 1.0 Negative -0.118506 0.922206 True
620 143 N Q 0.4306 0.6943 5.250 -0.31900 0.3995 8.250 1.00 ... 1.21300 4.5 -0.35540 0.7932 4.286 1.0 Hydrophilic 0.077604 -1.079604 True
762 161 S E 0.5686 0.6514 2.500 -0.96530 0.5165 5.875 1.00 ... 0.25090 3.0 -0.14880 0.4404 3.143 1.0 Negative 0.108826 1.000174 True
5271 492 Q L -0.2126 0.2587 5.500 0.43260 0.2934 4.500 1.00 ... 0.06666 5.5 0.48330 0.1827 5.143 1.0 Hydrophobic -0.067918 1.328918 True
5274 492 Q R -2.8230 0.6111 19.250 -0.07662 0.2715 23.120 1.00 ... 0.05388 20.0 0.50950 0.1192 20.570 1.0 Positive -0.658510 1.302210 True
5718 530 Q F -4.0150 0.4665 6.250 0.46740 0.2023 5.375 1.00 ... 0.06198 6.0 0.35920 0.2738 6.286 1.0 Aromatic -0.928195 1.012565 True
6065 555 D K -3.3210 0.6425 8.000 0.20520 0.3225 10.250 1.00 ... 1.22500 6.0 0.08504 0.4221 6.714 1.0 Positive -0.771181 -1.375819 True
6068 555 D R -2.4650 1.6330 3.000 0.35590 0.1457 4.625 1.00 ... 0.42050 4.0 0.07591 0.5260 4.143 1.0 Positive -0.577514 -0.932486 True
6498 588 I P -2.3490 0.8405 5.250 -0.25550 0.3004 4.750 1.00 ... 0.25530 5.5 -0.61310 0.3819 5.429 1.0 Special -0.551269 1.734269 True
6506 589 R G 1.6680 0.9018 7.333 -1.38000 0.4560 11.880 0.75 ... 0.65910 7.0 -0.96550 0.5763 8.286 1.0 Special 0.357561 1.648439 True
6602 598 P G -0.5523 0.7852 5.000 -0.86740 0.9393 4.625 1.00 ... 1.55900 4.5 0.23870 0.5146 5.429 1.0 Special -0.144773 1.298773 True

12 rows × 22 columns

 Here are the sites: 
 [79, 137, 143, 161, 492, 530, 555, 588, 589, 598]

Find mutations in the top quantile of binding for bEFNB2 and bEFNB3 both¶

In [17]:
def find_top_for_both(df):
    quantile_threshold = 0.99
    # Calculate the quantiles for both variables
    x_quantile = df['binding_mean_E2'].quantile(quantile_threshold)
    y_quantile = df['binding_mean_E3'].quantile(quantile_threshold)
    
    # Filter points that are above the quantile threshold
    df['meets_threshold'] = (df['binding_mean_E2'] >= x_quantile) & (df['binding_mean_E3'] >= y_quantile)
    subset = df.query('meets_threshold == True')
    top_mutants = list(subset['site'].unique())
    return df, top_mutants

top_residuals_df,top_mutants_list = find_top_for_both(residuals_df)
print(f' The sites with the top binding mutations are : \n {top_mutants_list}')
cleaned_df = top_residuals_df.query('meets_threshold == True')[['wildtype','site','mutant','binding_mean_E2','binding_mean_E3']].round(2)
print(f'Here are the specific mutations:\n')
display(cleaned_df)
 The sites with the top binding mutations are : 
 [104, 139, 211, 268, 306, 508, 553, 580, 589]
Here are the specific mutations:

wildtype site mutant binding_mean_E2 binding_mean_E3
390 E 104 T 1.93 0.65
593 N 139 W 1.93 1.00
1457 G 211 F 1.96 1.17
2044 T 268 W 1.52 0.77
2592 N 306 M 1.49 0.66
2595 N 306 R 1.52 0.90
5437 Y 508 Q 1.93 0.64
6040 S 553 W 1.94 0.99
6365 I 580 S 2.15 0.82
6506 R 589 G 1.67 2.01
6515 R 589 V 1.67 0.73

Plot correlations between individual mutational effects on binding, and color different subsets of sites¶

In [18]:
def plot_affinity_BLI_mutants(df, highlight_conditions, color, subset=None):
    df = df.round(2).copy()
    df['site_mutant'] = df['site'].astype(str) + df['mutant'].astype(str)

    if subset is not None:
        df = df[df['site'].isin(subset)]

    slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(
        df["binding_mean_E2"], df["binding_mean_E3"]
    )
    
    # make correlation chart
    chart = alt.Chart(df #,title=alt.Title("Correlation Between Mutant Binding Scores",subtitle=f'r = {r_value:.2f}'
    ).mark_point(
        color="gray", 
        size=30, 
        opacity=0.4, 
        filled=True
    ).encode(
        x=alt.X("binding_mean_E2", title=("bEFNB2 Binding"),axis=alt.Axis(tickCount=4)),
        y=alt.Y("binding_mean_E3", title=("bEFNB3 Binding"),axis=alt.Axis(tickCount=4)),
        tooltip=[
            "site",
            "wildtype",
            "mutant",
            "binding_mean_E2",
            "binding_mean_E3",
            "effect_E2",
            "effect_E3",
            "binding_std_E2",
            "binding_std_E3",
            "times_seen_binding_E2",
            "times_seen_binding_E3"
        ],
    )

    #make colored circles for specific data
    highlight = chart.transform_filter(highlight_conditions).mark_point(
        color=color, size=60, opacity=1, filled=True, stroke='black', strokeWidth=1
    )
    #write text near the orange circles
    text_on_point = chart.transform_filter(highlight_conditions).mark_text(
            align='center',baseline='top',dy=-20,fontSize=16
    ).encode(text='site_mutant')
    
    min = int(df["binding_mean_E2"].min())
    max = int(df["binding_mean_E3"].max())
    text = (
        alt.Chart({"values": [{"x": min, "y": max, "text": f"r = {r_value:.2f}"}]})
        .mark_text(align="left", baseline="top", dx=-10, dy=-20)
        .encode(x=alt.X("x:Q"), y=alt.Y("y:Q"), text="text:N")
    )
    # Vertical line at x=0
    vline = alt.Chart(pd.DataFrame({'x': [0]})).mark_rule(color='gray',opacity=1,strokeDash=[2,4]).encode(x='x:Q')
    # Horizontal line at y=0
    hline = alt.Chart(pd.DataFrame({'y': [0]})).mark_rule(color='gray',opacity=1,strokeDash=[2,4]).encode(y='y:Q')
    chart_and_text = chart + text_on_point + highlight + vline + hline
    return chart_and_text.properties(height=300,width=300)

Sites selected for BLI validation¶

In [19]:
# these are the data we want to show in orange circles
highlight_conditions = (
        (alt.datum.site == 244) & (alt.datum.mutant == 'W') |
        (alt.datum.site == 305) & (alt.datum.mutant == 'W') | 
        (alt.datum.site == 492) & (alt.datum.mutant == 'L') |
        (alt.datum.site == 507) & (alt.datum.mutant == 'I') |
        (alt.datum.site == 530) & (alt.datum.mutant == 'F') |
        (alt.datum.site == 553) & (alt.datum.mutant == 'W') |
        (alt.datum.site == 555) & (alt.datum.mutant == 'Y') |
        (alt.datum.site == 559) & (alt.datum.mutant == 'R') |
        (alt.datum.site == 588) & (alt.datum.mutant == 'V') 
)

E2_E3_corr_BLI_mutants = plot_affinity_BLI_mutants(df_binding_effect_merge,highlight_conditions,'#e49444')
E2_E3_corr_BLI_mutants.display()

Sites selected for neutralization validations¶

In [20]:
highlight_conditions_neut = (
        (alt.datum.site == 333) & (alt.datum.mutant == 'Q') |
        (alt.datum.site == 492) & (alt.datum.mutant == 'R') | 
        (alt.datum.site == 507) & (alt.datum.mutant == 'I') |
        (alt.datum.site == 530) & (alt.datum.mutant == 'F') |
        (alt.datum.site == 553) & (alt.datum.mutant == 'W') |
        (alt.datum.site == 555) & (alt.datum.mutant == 'K') 
)
neut_muts = plot_affinity_BLI_mutants(df_binding_effect_merge,highlight_conditions_neut,'#5778a4')
neut_muts.display()

Find mutations of interest¶

In [21]:
highlight_conditions_top_sites = (
        (alt.datum.site == 580) & (alt.datum.mutant == 'S') |
        (alt.datum.site == 211) & (alt.datum.mutant == 'F') | 
        (alt.datum.site == 553) & (alt.datum.mutant == 'W') |
        (alt.datum.site == 589) & (alt.datum.mutant == 'G') |
        (alt.datum.site == 306) & (alt.datum.mutant == 'R') |
        (alt.datum.site == 492) & (alt.datum.mutant == 'L') |
        (alt.datum.site == 492) & (alt.datum.mutant == 'R') |
        (alt.datum.site == 588) & (alt.datum.mutant == 'P') |
        (alt.datum.site == 530) & (alt.datum.mutant == 'F') |
        (alt.datum.site == 546) & (alt.datum.mutant == 'H') |
        (alt.datum.site == 555) & (alt.datum.mutant == 'K') 


)

E2_E3_corr = plot_affinity_BLI_mutants(residuals_df,highlight_conditions_top_sites,'#af7aa1')
E2_E3_corr.display()
if entry_binding_combined_corr_plot is not None:
    E2_E3_corr.save(E2_E3_correlation)

Plot correlations of mutations for individual sites of interest with letters of each mutant plotted¶

In [22]:
def plot_affinity_individual_mutants(df,mutant):
    df = df.round(2).copy()
    df = df[df['site'] == mutant]
    if df.empty:
        print('nothing here')
        pass
    else:
        wildtype_site = (df['wildtype'].astype(str) + df['site'].astype(str)).unique()[0]

        chart = alt.Chart(df,title=alt.Title(f"{wildtype_site}")
        ).mark_text(
            size=15,
            opacity=1,
        ).encode(
            x=alt.X("binding_mean_E2", title=("bEFNB2 Binding"),axis=alt.Axis(tickCount=3),scale=alt.Scale(domain=[-5,2])),
            y=alt.Y("binding_mean_E3", title=("bEFNB3 Binding"),axis=alt.Axis(tickCount=3),scale=alt.Scale(domain=[-2.5,2])),
            text=alt.Text('mutant'),
            color=alt.Color(
                'mutant_type_E2',
                #legend = None, 
                title='Amino acid type',
                scale=alt.Scale(
                    domain=['Aromatic', 'Hydrophilic', 'Hydrophobic','Negative', 'Positive', 'Special'],
                    range=["#4e79a7","#f28e2c","#e15759","#76b7b2","#59a14f","#edc949"])),
            tooltip=[
                "site",
                "wildtype",
                "mutant",
                "binding_mean_E2",
                "binding_mean_E3",
                "effect_E2",
                "effect_E3",
            ],
        )
        # Vertical line at x=0
        vline = alt.Chart(pd.DataFrame({'x': [0]})).mark_rule(color='gray',opacity=1,strokeDash=[2,4]).encode(x='x:Q')
        # Horizontal line at y=0
        hline = alt.Chart(pd.DataFrame({'y': [0]})).mark_rule(color='gray',opacity=1,strokeDash=[2,4]).encode(y='y:Q')
        final_chart = vline + hline + chart
        return final_chart.properties(height=200,width=200)

Make for individual mutations¶

In [23]:
def make_hconcat_amino_acid(site_list):
    empty_chart = []
    for site in site_list:
        tmp_df = df_binding_effect_merge[df_binding_effect_merge['site'] == site]
        non_nan_in_effect_E2 = tmp_df['binding_mean_E2'].notnull().any()
        non_nan_in_effect_E3 = tmp_df['binding_mean_E3'].notnull().any()
    
        if non_nan_in_effect_E2 and non_nan_in_effect_E3:        
            test_plots = plot_affinity_individual_mutants(tmp_df,site)
            empty_chart.append(test_plots)
        else:
            pass
    empty_chart_plot = alt.hconcat(*empty_chart)
    return empty_chart_plot

one = make_hconcat_amino_acid([211,306,492])
two = make_hconcat_amino_acid([530,546,553])
three = make_hconcat_amino_acid([555])

final_plot = alt.vconcat(one,two,three).configure_title(anchor='middle')
final_plot.display()

Now make for all contact sites¶

In [24]:
empty_charts = []
for site in config['contact_sites']:
    tmp_df = df_binding_effect_merge[df_binding_effect_merge['site'] == site]
    non_nan_in_effect_E2 = tmp_df['binding_mean_E2'].notnull().any()
    non_nan_in_effect_E3 = tmp_df['binding_mean_E3'].notnull().any()

    if non_nan_in_effect_E2 and non_nan_in_effect_E3:        
        contact_plots = plot_affinity_individual_mutants(tmp_df,site)
        empty_charts.append(contact_plots)
    else:
        pass

all_contact_plots = alt.vconcat(*empty_charts).resolve_scale(x='shared', y='shared')
all_contact_plots.display()

Make correlation amino acid letter plots for 580-590 loop¶

In [25]:
def make_hconcat_amino_acid(site_list):
    empty_chart = []
    for site in site_list:
        tmp_df = df_binding_effect_merge[df_binding_effect_merge['site'] == site]
        non_nan_in_effect_E2 = tmp_df['binding_mean_E2'].notnull().any()
        non_nan_in_effect_E3 = tmp_df['binding_mean_E3'].notnull().any()
    
        if non_nan_in_effect_E2 and non_nan_in_effect_E3:        
            test_plots = plot_affinity_individual_mutants(tmp_df,site)
            empty_chart.append(test_plots)
        else:
            pass
    empty_chart_plot = alt.hconcat(*empty_chart)
    return empty_chart_plot

one = make_hconcat_amino_acid([580,582,583])
two = make_hconcat_amino_acid([584,586,587])
three = make_hconcat_amino_acid([588,589])

final_plot = alt.vconcat(one,two,three).configure_title(anchor='middle')
final_plot.display()
In [26]:
for site in list(range(580,590)):
    tmp_df = df_binding_effect_merge[df_binding_effect_merge['site'] == site]
    non_nan_in_effect_E2 = tmp_df['binding_mean_E2'].notnull().any()
    non_nan_in_effect_E3 = tmp_df['binding_mean_E3'].notnull().any()

    if non_nan_in_effect_E2 and non_nan_in_effect_E3:        
        test_plots = plot_affinity_individual_mutants(tmp_df,site)
        test_plots.display()
    else:
        pass

Plot correlations between each site for mean value¶

In [27]:
def plot_affinity_solid_mean(df):
    df = df.dropna()
    means = (
        df.groupby("site")
        .agg(
            {
                "effect_E2": "mean",
                "effect_E3": "mean",
                "binding_mean_E2": "mean",
                "binding_mean_E3": "mean",
                "wildtype": "first",
            }
        )
        .reset_index().round(2)
    )
    slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(
        means["binding_mean_E2"], means["binding_mean_E3"]
    )
    r_value = float(r_value)
    chart = (
        alt.Chart(
            means,
            title=alt.Title(
                "Correlation between Aggregate Mutant Binding Scores",
                subtitle=f"r={r_value:.2f}",
            ),
        )
        .mark_point(size=50, opacity=0.5,filled=True,color='gray')
        .encode(
            x=alt.X(
                "binding_mean_E2",
                title=("Mean bEFNB2 binding"),
                axis=alt.Axis(tickCount=3),
            ),
            y=alt.Y(
                "binding_mean_E3",
                title=("Mean bEFNB3 binding"),
                axis=alt.Axis(tickCount=3),
            ),
            tooltip=[
                "site",
                "wildtype",
                "binding_mean_E2",
                "binding_mean_E3",
                "effect_E2",
                "effect_E3",
            ],
        )
    )
    text = (
        alt.Chart({"values": [{"x": -3.5, "y": 0.5, "text": f"r = {r_value:.2f}"}]})
        .mark_text(align="left", baseline="top", dx=0, dy=0)
        .encode(x=alt.X("x:Q"), y=alt.Y("y:Q"), text="text:N")
    )
    chart_and_text = chart.properties(width=200,height=200)
    return chart_and_text


E2_E3_site_corr = plot_affinity_solid_mean(df_binding_effect_merge)
E2_E3_site_corr.display()
if entry_binding_combined_corr_plot is not None:
    E2_E3_site_corr.save(E2_E3_correlation_site)

Make plot showing binding by site¶

In [28]:
def plot_affinity_by_site(df):
     # define ranges of different RBP regions
    barrel_ranges = {
        "Stalk": list(range(70, 148)),
        "Neck": list(range(148, 166)),
        "Linker": list(range(166, 178)),
        "Head": list(range(178, 603)),
    }
    
    custom_order = ["Stalk", "Neck", "Linker", "Head"] #custom order for color legend
    
    agg_means = [] #store aggregation 
    
    # For each barrel, filter the dataframe to the sites belonging to that barrel and then store the means
    for barrel, sites in barrel_ranges.items():
        subset = df[df["site"].isin(sites)]
        for _, row in subset.iterrows():
            agg_means.append(
                {"region": barrel, "binding_mean": row["binding_mean"], "site": row["site"],"selection": row["selection"]}
            )
        agg_means_df = pd.DataFrame(agg_means).round(2)
    
    # Setup interactivity
    variant_selector = alt.selection_point(
        on="mouseover", nearest=True, empty=False, fields=["site"], value=0
    )
    #make chart
    chart = alt.Chart(agg_means_df).mark_bar(stroke='black',size=1.5,binSpacing=0,color='black').encode(
        alt.X("site")
            .title("Site")
            .axis(tickCount=5,labelAngle=-90,grid=True),
            #.scale(domain=[70, 602]),
        
        alt.Y('mean(binding_mean)')
            .axis(tickCount=3)
            .title('Mean binding'),
        alt.Row('selection',title=None),
        tooltip=['site'],
        strokeWidth=alt.condition(variant_selector, alt.value(1), alt.value(0)),
        color=alt.Color('region',sort=custom_order,title='Region'),
        #strokeColor=alt.condition(variant_selector, alt.value('orange'), alt.value('black')),


    ).properties(height=150, width=800).add_params(variant_selector).resolve_scale(y='independent')
    

    return chart


binding_by_site = plot_affinity_by_site(df_binding_effect_concat)
binding_by_site.display()

Make bar chart showing max binding score for each contact residue¶

In [29]:
def plot_affinity_by_contact_site(df, sites_to_show):
    # Filter the DataFrame based on the sites to show
    contact_df = df[df["site"].isin(sites_to_show)]
    
    # Define a selection for highlighting bars on hover
    variant_selector = alt.selection_point(on="mouseover", nearest=True, empty=False, fields=["site"])
    
    # Create the chart
    chart = alt.Chart(contact_df).mark_bar(stroke='black').encode(
        y=alt.Y("site:N", title="Site",sort='-x'),
        x=alt.X("max(binding_mean):Q", title="Max binding mutant at site"),
        color=alt.condition(variant_selector, alt.value("orange"), alt.value("black")),
        strokeWidth=alt.condition(variant_selector, alt.value(1), alt.value(0)),
        column=alt.Column('selection',title=None)
    ).add_params(variant_selector).properties(width=200, height=alt.Step(12)).resolve_scale(x='independent',y='shared').configure_header(
        labelFontSize=20,  
        labelAngle=0,
        labelAlign='center',
        labelAnchor='middle',
        labelFont='Helvetica Light',
        labelFontStyle='bold',
        labelPadding=3
    )
    
    return chart
In [30]:
contact_binding_by_site = plot_affinity_by_contact_site(df_binding_effect_concat, config["contact_sites"])
contact_binding_by_site.display()

Make bubble plots for binding in different areas of receptor pocket¶

In [31]:
def make_bubble_binding_region(df):
    barrel_ranges = {
        "Stalk": list(range(70, 148)),
        "Neck": list(range(148, 166)),
        "Linker": list(range(166, 178)),
        "Head": list(range(178, 603)),
        "Receptor Contact": config["contact_sites"],
    }
    custom_order = ["Stalk", "Neck", "Linker", "Head", "Receptor Contact"]
    agg_means = []

    # For each barrel, filter the site_means dataframe to the sites belonging to that barrel and then store the means
    for barrel, sites in barrel_ranges.items():
        subset = df[df["site"].isin(sites)]
        for _, row in subset.iterrows():
            agg_means.append(
                {
                    "region": barrel,
                    "binding_mean": row["binding_mean"],
                    "site": row["site"],
                    "mutant": row["mutant"],
                    "selection": row["selection"],
                }
            )
        agg_means_df = pd.DataFrame(agg_means)

    variant_selector = alt.selection_point(
        on="mouseover", empty=False, nearest=True, fields=["site", "mutant"], value=1
    )

    chart = alt.Chart(agg_means_df).mark_point(stroke="black",filled=True).encode(
        x=alt.X(
            "region:O",
            sort=custom_order,
            title="RBP Region",
            axis=alt.Axis(labelAngle=-90),
        ),
        y=alt.Y(
            "binding_mean",
            title="Binding",
            axis=alt.Axis(tickCount=4),
        ),
        xOffset="random:Q",
        tooltip=["region", "binding_mean", "site", "mutant"],
        color=alt.condition(
            variant_selector, alt.value("orange"), alt.value("black")
        ),
        opacity=alt.condition(variant_selector, alt.value(1), alt.value(0.1)),
        strokeWidth=alt.condition(variant_selector, alt.value(2), alt.value(0)),
        size=alt.condition(variant_selector, alt.value(70), alt.value(20)),
        column=alt.Column('selection',title=None)
    ).transform_calculate(random="sqrt(-1*log(random()))*cos(2*PI*random())"
    ).properties(
        width=300, 
        height=300
    ).add_params(variant_selector
    ).resolve_scale(y='independent'
    ).configure_header(
        labelFontSize=20,  
        labelAngle=0,
        labelAlign='center',
        labelAnchor='middle',
        labelFont='Helvetica Light',
        labelFontStyle='bold',
        labelPadding=1
    )
    
    return chart

entry_region_bubble = make_bubble_binding_region(df_binding_effect_concat)
entry_region_bubble.display()

Plot effects of selected mutants on cell entry¶

In [32]:
def plot_effects_of_mutants(df):
    tmp_df = df.copy()
    tmp_df = tmp_df[
        ((tmp_df['site'] == 598) & (tmp_df['mutant'] == 'G')) |
        ((tmp_df['site'] == 553) & (tmp_df['mutant'] == 'W')) |
        ((tmp_df['site'] == 211) & (tmp_df['mutant'] == 'F')) |
        ((tmp_df['site'] == 546) & (tmp_df['mutant'] == 'H')) |
        ((tmp_df['site'] == 143) & (tmp_df['mutant'] == 'Q')) |
        ((tmp_df['site'] == 331) & (tmp_df['mutant'] == 'E')) |
        ((tmp_df['site'] == 566) & (tmp_df['mutant'] == 'C')) 
    ]
    tmp_df['label'] = (tmp_df['wildtype'].astype(str) + tmp_df['site'].astype(str) + tmp_df['mutant'].astype(str))

    tmp_df = tmp_df.sort_values(by='site')
    binding_chart = alt.Chart(tmp_df).mark_bar().encode(
        alt.X("label:N",title='Mutant',sort=None),
        alt.Y("binding_mean:Q",title='Binding score',axis=alt.Axis(tickCount=4)),
        xOffset="selection:N",
        color=alt.Color("selection:N",title='Receptor selection'),
    )
    binding_chart.resolve_scale(y='independent',x='shared').display()
plot_effects_of_mutants(df_binding_effect_concat)
In [ ]: