plot_BLI_data.ipynb¶
Analyzes correlation between BLI binding measurements and DMS
- Written by Brendan Larsen
In [1]:
# this cell is tagged as parameters for `papermill` parameterization
altair_config = None
BLI_corr_plot = None
binding_E2_file = None
binding_E3_file = None
In [2]:
# Parameters
altair_config = "data/custom_analyses_data/theme.py"
binding_E2_file = "results/filtered_data/binding/e2_binding_filtered.csv"
binding_E3_file = "results/filtered_data/binding/e3_binding_filtered.csv"
BLI_corr_plot = "results/images/binding_BLI_corr.html"
In [3]:
import math
import os
import re
import altair as alt
import numpy as np
import pandas as pd
import scipy.stats
import yaml
import sys
# allow more rows for Altair
_ = alt.data_transformers.disable_max_rows()
# setup working directory
if os.getcwd() == "/fh/fast/bloom_j/computational_notebooks/blarsen/2023/Nipah_Malaysia_RBP_DMS/":
pass
print("Already in correct directory")
else:
os.chdir("/fh/fast/bloom_j/computational_notebooks/blarsen/2023/Nipah_Malaysia_RBP_DMS/")
print("Setup in correct directory")
#import altair themes from /data/custom_analyses_data/theme.py and enable
sys.path.append('data/custom_analyses_data/')
import theme
alt.themes.register('main_theme', theme.main_theme)
alt.themes.enable('main_theme')
Setup in correct directory
Out[3]:
ThemeRegistry.enable('main_theme')
In [4]:
##hard paths in case don't want to run with snakemake
if BLI_corr_plot is None:
print("loading hard paths")
nipah_config = "nipah_config.yaml"
# input files
binding_E2_file = "results/filtered_data/binding/e2_binding_filtered.csv"
binding_E3_file = "results/filtered_data/binding/e3_binding_filtered.csv"
In [5]:
# import binding data
df_E2_filter = pd.read_csv(binding_E2_file)
display(df_E2_filter.head(3))
df_E3_filter = pd.read_csv(binding_E3_file)
display(df_E3_filter.head(3))
site | wildtype | mutant | binding_mean | binding_std | times_seen_binding | effect | effect_std | times_seen_cell_entry | frac_models | mutant_type | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 71 | Q | D | -0.7817 | 1.6460 | 3.25 | -1.164 | 0.8890 | 4.500 | 1.0 | Negative |
1 | 71 | Q | E | 0.1659 | 0.4309 | 3.50 | -1.255 | 0.3123 | 5.375 | 1.0 | Negative |
2 | 71 | Q | F | -0.3429 | 0.5299 | 3.00 | -1.058 | 0.6637 | 4.625 | 1.0 | Aromatic |
site | wildtype | mutant | binding_mean | binding_std | times_seen_binding | effect | effect_std | times_seen_cell_entry | frac_models | mutant_type | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 71 | Q | C | -0.1332 | 0.1525 | 3.0 | -0.7227 | 0.7828 | 3.000 | 1.0 | Special |
1 | 71 | Q | D | -0.2937 | 0.3305 | 4.0 | -0.3884 | 0.6369 | 3.429 | 1.0 | Negative |
2 | 71 | Q | E | -0.3527 | 0.1762 | 4.0 | -0.2482 | 0.9791 | 4.571 | 1.0 | Negative |
In [6]:
### load BLI data and merge with binding from DMS
BLI_df = pd.read_csv('data/custom_analyses_data/experimental_data/BLI_data.csv')
#merge
BLI_w_E2 = pd.merge(BLI_df,df_E2_filter[['site','wildtype','mutant','binding_mean']],on=['site','mutant'],how='left')
df = pd.merge(BLI_w_E2,df_E3_filter[['site','wildtype','mutant','binding_mean']],on=['site','mutant','wildtype'],how='left',suffixes=['_E2','_E3'])
#Make mutation column
df['mutation'] = (df['wildtype'].astype(str) + df['site'].astype(str) + df['mutant'].astype(str)).astype(object)
df.drop(['site','mutant','wildtype'],axis=1,inplace=True)
#Add unmutated information
new_row = {'B2_1': 0, 'B2_2': 0, 'B3_1': 0, 'B3_2': 0, 'binding_mean_E2': 0, 'binding_mean_E3': 0, 'mutation': 'Unmutated'}
new_df = pd.DataFrame([new_row])
#combine
df = pd.concat([df,new_df])
display(df)
B2_1 | B2_2 | B3_1 | B3_2 | binding_mean_E2 | binding_mean_E3 | mutation | |
---|---|---|---|---|---|---|---|
0 | 1.562500 | 3.339844 | -5.419813 | -6.871482 | 0.09378 | -0.29390 | V244W |
1 | -1.929012 | -0.527344 | -37.849845 | -34.380863 | -0.07562 | -0.14020 | L305W |
2 | -3.587963 | -1.328125 | 146.112839 | 155.863039 | -0.21260 | 1.26100 | Q492L |
3 | -1.581790 | 3.066406 | 44.757885 | 35.905253 | 0.67810 | 0.19680 | V507I |
4 | -25.810185 | -26.523438 | -24.855620 | -20.333021 | -4.01500 | 0.08437 | Q530F |
5 | 7.793210 | 11.992188 | 56.352732 | 80.886492 | 1.94500 | 0.99430 | S553W |
6 | -6.732253 | -5.996094 | -20.746335 | -21.998124 | -1.43500 | -0.61140 | D555Y |
7 | 3.993056 | 4.238281 | 42.536650 | 23.006567 | 1.19500 | 0.28830 | Q559R |
8 | -8.738426 | -7.382812 | -45.335406 | -53.939962 | -1.82300 | -0.29660 | I588V |
0 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.00000 | Unmutated |
In [7]:
df['B2_mean'] = df[['B2_1', 'B2_2']].mean(axis=1)
df['B3_mean'] = df[['B3_1', 'B3_2']].mean(axis=1)
df['B2_std'] = df[['B2_1','B2_2']].std(axis=1)
df['B3_std'] = df[['B3_1','B3_2']].std(axis=1)
df['B2_upper'] = df['B2_mean'] + df['B2_std']
df['B2_lower'] = df['B2_mean'] - df['B2_std']
df['B3_upper'] = df['B3_mean'] + df['B3_std']
df['B3_lower'] = df['B3_mean'] - df['B3_std']
display(df)
B2_1 | B2_2 | B3_1 | B3_2 | binding_mean_E2 | binding_mean_E3 | mutation | B2_mean | B3_mean | B2_std | B3_std | B2_upper | B2_lower | B3_upper | B3_lower | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1.562500 | 3.339844 | -5.419813 | -6.871482 | 0.09378 | -0.29390 | V244W | 2.451172 | -6.145648 | 1.256772 | 1.026485 | 3.707944 | 1.194400 | -5.119163 | -7.172133 |
1 | -1.929012 | -0.527344 | -37.849845 | -34.380863 | -0.07562 | -0.14020 | L305W | -1.228178 | -36.115354 | 0.991129 | 2.452940 | -0.237049 | -2.219307 | -33.662413 | -38.568294 |
2 | -3.587963 | -1.328125 | 146.112839 | 155.863039 | -0.21260 | 1.26100 | Q492L | -2.458044 | 150.987939 | 1.597947 | 6.894433 | -0.860097 | -4.055991 | 157.882372 | 144.093506 |
3 | -1.581790 | 3.066406 | 44.757885 | 35.905253 | 0.67810 | 0.19680 | V507I | 0.742308 | 40.331569 | 3.286771 | 6.259756 | 4.029079 | -2.544463 | 46.591326 | 34.071813 |
4 | -25.810185 | -26.523438 | -24.855620 | -20.333021 | -4.01500 | 0.08437 | Q530F | -26.166812 | -22.594320 | 0.504346 | 3.197960 | -25.662465 | -26.671158 | -19.396360 | -25.792281 |
5 | 7.793210 | 11.992188 | 56.352732 | 80.886492 | 1.94500 | 0.99430 | S553W | 9.892699 | 68.619612 | 2.969126 | 17.347988 | 12.861824 | 6.923573 | 85.967600 | 51.271624 |
6 | -6.732253 | -5.996094 | -20.746335 | -21.998124 | -1.43500 | -0.61140 | D555Y | -6.364173 | -21.372229 | 0.520543 | 0.885148 | -5.843630 | -6.884717 | -20.487081 | -22.257378 |
7 | 3.993056 | 4.238281 | 42.536650 | 23.006567 | 1.19500 | 0.28830 | Q559R | 4.115668 | 32.771608 | 0.173401 | 13.809855 | 4.289069 | 3.942268 | 46.581463 | 18.961754 |
8 | -8.738426 | -7.382812 | -45.335406 | -53.939962 | -1.82300 | -0.29660 | I588V | -8.060619 | -49.637684 | 0.958563 | 6.084340 | -7.102056 | -9.019183 | -43.553345 | -55.722024 |
0 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.00000 | Unmutated | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
In [8]:
##### calculate R value:
slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(
df["B2_mean"], df["binding_mean_E2"]
)
r_value_E2 = float(r_value)
print(r_value_E2)
slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(
df["B3_mean"], df["binding_mean_E3"]
)
r_value_E3 = float(r_value)
print(r_value_E3)
0.9763779669852755 0.9057068049246291
In [9]:
# Sorting function to put 'WT' on top of the legend, followed by numerical order
def custom_sort_order(array):
# Sort based on the numerical part in mutation strings, e.g., '530' in 'Q530F'
def extract_number(mutation):
num = re.search(r"\d+", mutation)
return int(num.group()) if num else 0
array = sorted(array, key=extract_number)
# Move 'WT' to the beginning of the list
if "Unmutated" in array:
array.remove("Unmutated")
array.insert(0, "Unmutated")
return array
# Define the category10 colors manually
category10_colors = ["#4e79a7","#f28e2c","#e15759","#76b7b2","#59a14f","#edc949","#af7aa1","#ff9da7","#9c755f","#bab0ab"]
# Adjust colors based on the unique mutations
colors = ["black"] + category10_colors[: len(df["mutation"].unique()) - 1]
# Create the Altair chart
corr_chart = alt.Chart(df, title=alt.Title("bEFNB2")).mark_point(size=125,filled=True,opacity=1).encode(
x=alt.X(
"binding_mean_E2:Q",
title="DMS Binding",
#scale=alt.Scale(domain=[-4, 2]),
axis=alt.Axis(tickCount=3),
),
y=alt.Y(
"B2_mean",
title="Binding measured by BLI",
#scale=alt.Scale(type="log", base=10),
axis=alt.Axis(tickCount=4),
# format=".0e", tickCount=4
#), # Display in scientific notation
),
color=alt.Color(
"mutation",
title="Variant",
scale=alt.Scale(
domain=custom_sort_order(df["mutation"].unique()), range=colors
),
),
)
min_effect_E2 = int(df["binding_mean_E2"].min())
max_mean_luciferase_E2 = int(df["B2_mean"].max())
text = (
alt.Chart(
{
"values": [
{
"x": min_effect_E2,
"y": max_mean_luciferase_E2,
"text": f"r = {r_value_E2:.2f}",
}
]
}
)
.mark_text(
align="left",
baseline="top",
dx=-10, # Adjust this for position
dy=-20, # Adjust this for position
)
.encode(x=alt.X("x:Q"), y=alt.Y("y:Q"), text="text:N")
)
error = (
alt.Chart(df)
.mark_errorbar(opacity=1)
.encode(
x="binding_mean_E2",
y=alt.Y("B2_lower",title='Binding measured by BLI'),
y2="B2_upper",
color="mutation",
)
)
# Vertical line at x=0
vline = alt.Chart(pd.DataFrame({'x': [0]})).mark_rule(color='gray',opacity=1,strokeDash=[2,4]).encode(x='x:Q')
# Horizontal line at y=0
hline = alt.Chart(pd.DataFrame({'y': [0]})).mark_rule(color='gray',opacity=1,strokeDash=[2,4]).encode(y='y:Q')
final_chart_E2 = vline + hline + corr_chart + error + text
final_chart = final_chart_E2.properties(height=200,width=200)
final_chart
Out[9]:
In [10]:
# Create the Altair chart
corr_chart_E3 = alt.Chart(df, title=alt.Title("bEFNB3")).mark_point(size=125,filled=True,opacity=1).encode(
x=alt.X(
"binding_mean_E3:Q",
title="DMS Binding",
scale=alt.Scale(domain=[-1, 1.5]),
axis=alt.Axis(tickCount=3),
),
y=alt.Y(
"B3_mean",
title="Binding measured by BLI",
#scale=alt.Scale(type="log", base=10),
axis=alt.Axis(tickCount=4),
# format=".0e", tickCount=4
#), # Display in scientific notation
),
color=alt.Color(
"mutation",
title="Variant",
scale=alt.Scale(
domain=custom_sort_order(df["mutation"].unique()), range=colors
),
),
)
min_effect_E2 = int(df["binding_mean_E3"].min())
max_mean_luciferase_E2 = int(df["B3_mean"].max())
text_E3 = (
alt.Chart(
{
"values": [
{
"x": min_effect_E2,
"y": max_mean_luciferase_E2,
"text": f"r = {r_value_E3:.2f}",
}
]
}
)
.mark_text(
align="left",
baseline="top",
dx=-70, # Adjust this for position
dy=0, # Adjust this for position
)
.encode(x=alt.X("x:Q"), y=alt.Y("y:Q"), text="text:N")
)
error = (
alt.Chart(df)
.mark_errorbar(opacity=1)
.encode(
x="binding_mean_E3",
y=alt.Y("B3_lower", title="Binding measured by BLI"),
y2="B3_upper",
color="mutation",
)
)
# Vertical line at x=0
vline = alt.Chart(pd.DataFrame({'x': [0]})).mark_rule(color='gray',opacity=1,strokeDash=[2,4]).encode(x='x:Q')
# Horizontal line at y=0
hline = alt.Chart(pd.DataFrame({'y': [0]})).mark_rule(color='gray',opacity=1,strokeDash=[2,4]).encode(y='y:Q')
final_chart_E3 = vline + hline + corr_chart_E3 + error + text_E3
final_chart_E3 = final_chart_E3.properties(height=200,width=200)
final_chart_E3
Out[10]:
In [11]:
combined = alt.hconcat(final_chart_E2, final_chart_E3)#.resolve_scale(y='shared')
combined.display()
combined.save(BLI_corr_plot)
In [ ]:
In [ ]: