plot_BLI_data.ipynb¶

Analyzes correlation between BLI binding measurements and DMS

  • Written by Brendan Larsen
In [1]:
# this cell is tagged as parameters for `papermill` parameterization
altair_config = None
BLI_corr_plot = None

binding_E2_file = None
binding_E3_file = None
In [2]:
# Parameters
altair_config = "data/custom_analyses_data/theme.py"
binding_E2_file = "results/filtered_data/binding/e2_binding_filtered.csv"
binding_E3_file = "results/filtered_data/binding/e3_binding_filtered.csv"
BLI_corr_plot = "results/images/binding_BLI_corr.html"
In [3]:
import math
import os
import re
import altair as alt
import numpy as np
import pandas as pd
import scipy.stats
import yaml
import sys

# allow more rows for Altair
_ = alt.data_transformers.disable_max_rows()

# setup working directory
if os.getcwd() == "/fh/fast/bloom_j/computational_notebooks/blarsen/2023/Nipah_Malaysia_RBP_DMS/":
    pass
    print("Already in correct directory")
else:
    os.chdir("/fh/fast/bloom_j/computational_notebooks/blarsen/2023/Nipah_Malaysia_RBP_DMS/")
    print("Setup in correct directory")

#import altair themes from /data/custom_analyses_data/theme.py and enable
sys.path.append('data/custom_analyses_data/')
import theme
alt.themes.register('main_theme', theme.main_theme)
alt.themes.enable('main_theme')
Setup in correct directory
Out[3]:
ThemeRegistry.enable('main_theme')
In [4]:
##hard paths in case don't want to run with snakemake
if BLI_corr_plot is None:
    print("loading hard paths")
    nipah_config = "nipah_config.yaml"
    
    # input files
    binding_E2_file = "results/filtered_data/binding/e2_binding_filtered.csv"
    binding_E3_file = "results/filtered_data/binding/e3_binding_filtered.csv"
In [5]:
# import binding data
df_E2_filter = pd.read_csv(binding_E2_file)
display(df_E2_filter.head(3))
df_E3_filter = pd.read_csv(binding_E3_file)
display(df_E3_filter.head(3))
site wildtype mutant binding_mean binding_std times_seen_binding effect effect_std times_seen_cell_entry frac_models mutant_type
0 71 Q D -0.7817 1.6460 3.25 -1.164 0.8890 4.500 1.0 Negative
1 71 Q E 0.1659 0.4309 3.50 -1.255 0.3123 5.375 1.0 Negative
2 71 Q F -0.3429 0.5299 3.00 -1.058 0.6637 4.625 1.0 Aromatic
site wildtype mutant binding_mean binding_std times_seen_binding effect effect_std times_seen_cell_entry frac_models mutant_type
0 71 Q C -0.1332 0.1525 3.0 -0.7227 0.7828 3.000 1.0 Special
1 71 Q D -0.2937 0.3305 4.0 -0.3884 0.6369 3.429 1.0 Negative
2 71 Q E -0.3527 0.1762 4.0 -0.2482 0.9791 4.571 1.0 Negative
In [6]:
### load BLI data and merge with binding from DMS
BLI_df = pd.read_csv('data/custom_analyses_data/experimental_data/BLI_data.csv')

#merge
BLI_w_E2 = pd.merge(BLI_df,df_E2_filter[['site','wildtype','mutant','binding_mean']],on=['site','mutant'],how='left')
df = pd.merge(BLI_w_E2,df_E3_filter[['site','wildtype','mutant','binding_mean']],on=['site','mutant','wildtype'],how='left',suffixes=['_E2','_E3'])

#Make mutation column
df['mutation'] = (df['wildtype'].astype(str) + df['site'].astype(str) + df['mutant'].astype(str)).astype(object)
df.drop(['site','mutant','wildtype'],axis=1,inplace=True)

#Add unmutated information
new_row = {'B2_1': 0, 'B2_2': 0, 'B3_1': 0, 'B3_2': 0, 'binding_mean_E2': 0, 'binding_mean_E3': 0, 'mutation': 'Unmutated'}
new_df = pd.DataFrame([new_row])

#combine
df = pd.concat([df,new_df])
display(df)
B2_1 B2_2 B3_1 B3_2 binding_mean_E2 binding_mean_E3 mutation
0 1.562500 3.339844 -5.419813 -6.871482 0.09378 -0.29390 V244W
1 -1.929012 -0.527344 -37.849845 -34.380863 -0.07562 -0.14020 L305W
2 -3.587963 -1.328125 146.112839 155.863039 -0.21260 1.26100 Q492L
3 -1.581790 3.066406 44.757885 35.905253 0.67810 0.19680 V507I
4 -25.810185 -26.523438 -24.855620 -20.333021 -4.01500 0.08437 Q530F
5 7.793210 11.992188 56.352732 80.886492 1.94500 0.99430 S553W
6 -6.732253 -5.996094 -20.746335 -21.998124 -1.43500 -0.61140 D555Y
7 3.993056 4.238281 42.536650 23.006567 1.19500 0.28830 Q559R
8 -8.738426 -7.382812 -45.335406 -53.939962 -1.82300 -0.29660 I588V
0 0.000000 0.000000 0.000000 0.000000 0.00000 0.00000 Unmutated
In [7]:
df['B2_mean'] = df[['B2_1', 'B2_2']].mean(axis=1)
df['B3_mean'] = df[['B3_1', 'B3_2']].mean(axis=1)
df['B2_std'] = df[['B2_1','B2_2']].std(axis=1)
df['B3_std'] = df[['B3_1','B3_2']].std(axis=1)
df['B2_upper'] = df['B2_mean'] + df['B2_std']
df['B2_lower'] = df['B2_mean'] - df['B2_std']
df['B3_upper'] = df['B3_mean'] + df['B3_std']
df['B3_lower'] = df['B3_mean'] - df['B3_std']

display(df)
B2_1 B2_2 B3_1 B3_2 binding_mean_E2 binding_mean_E3 mutation B2_mean B3_mean B2_std B3_std B2_upper B2_lower B3_upper B3_lower
0 1.562500 3.339844 -5.419813 -6.871482 0.09378 -0.29390 V244W 2.451172 -6.145648 1.256772 1.026485 3.707944 1.194400 -5.119163 -7.172133
1 -1.929012 -0.527344 -37.849845 -34.380863 -0.07562 -0.14020 L305W -1.228178 -36.115354 0.991129 2.452940 -0.237049 -2.219307 -33.662413 -38.568294
2 -3.587963 -1.328125 146.112839 155.863039 -0.21260 1.26100 Q492L -2.458044 150.987939 1.597947 6.894433 -0.860097 -4.055991 157.882372 144.093506
3 -1.581790 3.066406 44.757885 35.905253 0.67810 0.19680 V507I 0.742308 40.331569 3.286771 6.259756 4.029079 -2.544463 46.591326 34.071813
4 -25.810185 -26.523438 -24.855620 -20.333021 -4.01500 0.08437 Q530F -26.166812 -22.594320 0.504346 3.197960 -25.662465 -26.671158 -19.396360 -25.792281
5 7.793210 11.992188 56.352732 80.886492 1.94500 0.99430 S553W 9.892699 68.619612 2.969126 17.347988 12.861824 6.923573 85.967600 51.271624
6 -6.732253 -5.996094 -20.746335 -21.998124 -1.43500 -0.61140 D555Y -6.364173 -21.372229 0.520543 0.885148 -5.843630 -6.884717 -20.487081 -22.257378
7 3.993056 4.238281 42.536650 23.006567 1.19500 0.28830 Q559R 4.115668 32.771608 0.173401 13.809855 4.289069 3.942268 46.581463 18.961754
8 -8.738426 -7.382812 -45.335406 -53.939962 -1.82300 -0.29660 I588V -8.060619 -49.637684 0.958563 6.084340 -7.102056 -9.019183 -43.553345 -55.722024
0 0.000000 0.000000 0.000000 0.000000 0.00000 0.00000 Unmutated 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
In [8]:
##### calculate R value:
slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(
    df["B2_mean"], df["binding_mean_E2"]
)
r_value_E2 = float(r_value)
print(r_value_E2)

slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(
    df["B3_mean"], df["binding_mean_E3"]
)
r_value_E3 = float(r_value)
print(r_value_E3)
0.9763779669852755
0.9057068049246291
In [9]:
# Sorting function to put 'WT' on top of the legend, followed by numerical order
def custom_sort_order(array):
    # Sort based on the numerical part in mutation strings, e.g., '530' in 'Q530F'
    def extract_number(mutation):
        num = re.search(r"\d+", mutation)
        return int(num.group()) if num else 0

    array = sorted(array, key=extract_number)

    # Move 'WT' to the beginning of the list
    if "Unmutated" in array:
        array.remove("Unmutated")
        array.insert(0, "Unmutated")
    return array


# Define the category10 colors manually
category10_colors = ["#4e79a7","#f28e2c","#e15759","#76b7b2","#59a14f","#edc949","#af7aa1","#ff9da7","#9c755f","#bab0ab"]

# Adjust colors based on the unique mutations
colors = ["black"] + category10_colors[: len(df["mutation"].unique()) - 1]

# Create the Altair chart
corr_chart = alt.Chart(df, title=alt.Title("bEFNB2")).mark_point(size=125,filled=True,opacity=1).encode(
        x=alt.X(
            "binding_mean_E2:Q",
            title="DMS Binding",
            #scale=alt.Scale(domain=[-4, 2]),
            axis=alt.Axis(tickCount=3),
        ),
        y=alt.Y(
            "B2_mean",
            title="Binding measured by BLI",
            #scale=alt.Scale(type="log", base=10),
            axis=alt.Axis(tickCount=4),
            #    format=".0e", tickCount=4
            #),  # Display in scientific notation
        ),
        color=alt.Color(
            "mutation",
            title="Variant",
            scale=alt.Scale(
                domain=custom_sort_order(df["mutation"].unique()), range=colors
            ),
        ),
)

min_effect_E2 = int(df["binding_mean_E2"].min())
max_mean_luciferase_E2 = int(df["B2_mean"].max())

text = (
    alt.Chart(
        {
            "values": [
                {
                    "x": min_effect_E2,
                    "y": max_mean_luciferase_E2,
                    "text": f"r = {r_value_E2:.2f}",
                }
            ]
        }
    )
    .mark_text(
        align="left",
        baseline="top",
        dx=-10,  # Adjust this for position
        dy=-20,  # Adjust this for position
        
    )
    .encode(x=alt.X("x:Q"), y=alt.Y("y:Q"), text="text:N")
)
error = (
        alt.Chart(df)
        .mark_errorbar(opacity=1)
        .encode(
            x="binding_mean_E2",
            y=alt.Y("B2_lower",title='Binding measured by BLI'),
            y2="B2_upper",
            color="mutation",
        )
)
# Vertical line at x=0
vline = alt.Chart(pd.DataFrame({'x': [0]})).mark_rule(color='gray',opacity=1,strokeDash=[2,4]).encode(x='x:Q')
# Horizontal line at y=0
hline = alt.Chart(pd.DataFrame({'y': [0]})).mark_rule(color='gray',opacity=1,strokeDash=[2,4]).encode(y='y:Q')

final_chart_E2 = vline + hline + corr_chart  + error + text
final_chart = final_chart_E2.properties(height=200,width=200)
final_chart
Out[9]:
In [10]:
# Create the Altair chart
corr_chart_E3 = alt.Chart(df, title=alt.Title("bEFNB3")).mark_point(size=125,filled=True,opacity=1).encode(
        x=alt.X(
            "binding_mean_E3:Q",
            title="DMS Binding",
            scale=alt.Scale(domain=[-1, 1.5]),
            axis=alt.Axis(tickCount=3),
        ),
        y=alt.Y(
            "B3_mean",
            title="Binding measured by BLI",
            #scale=alt.Scale(type="log", base=10),
            axis=alt.Axis(tickCount=4),
            #    format=".0e", tickCount=4
            #),  # Display in scientific notation
        ),
        color=alt.Color(
            "mutation",
            title="Variant",
            scale=alt.Scale(
                domain=custom_sort_order(df["mutation"].unique()), range=colors
            ),
        ),
)
min_effect_E2 = int(df["binding_mean_E3"].min())
max_mean_luciferase_E2 = int(df["B3_mean"].max())

text_E3 = (
    alt.Chart(
        {
            "values": [
                {
                    "x": min_effect_E2,
                    "y": max_mean_luciferase_E2,
                    "text": f"r = {r_value_E3:.2f}",
                }
            ]
        }
    )
    .mark_text(
        align="left",
        baseline="top",
        dx=-70,  # Adjust this for position
        dy=0,  # Adjust this for position
        
    )
    .encode(x=alt.X("x:Q"), y=alt.Y("y:Q"), text="text:N")
)

error = (
        alt.Chart(df)
        .mark_errorbar(opacity=1)
        .encode(
            x="binding_mean_E3",
            y=alt.Y("B3_lower", title="Binding measured by BLI"),
            y2="B3_upper",
            color="mutation",
        )
)

# Vertical line at x=0
vline = alt.Chart(pd.DataFrame({'x': [0]})).mark_rule(color='gray',opacity=1,strokeDash=[2,4]).encode(x='x:Q')
# Horizontal line at y=0
hline = alt.Chart(pd.DataFrame({'y': [0]})).mark_rule(color='gray',opacity=1,strokeDash=[2,4]).encode(y='y:Q')

final_chart_E3 = vline + hline + corr_chart_E3  + error + text_E3 
final_chart_E3 = final_chart_E3.properties(height=200,width=200)
final_chart_E3
Out[10]:
In [11]:
combined = alt.hconcat(final_chart_E2, final_chart_E3)#.resolve_scale(y='shared')
combined.display()
combined.save(BLI_corr_plot)
In [ ]:
 
In [ ]: