cell_entry_validations.ipynb¶
This notebook reads in experimentally measured luciferase readings (RLU/μL) for individual mutants and plots their correlation with the cell entry scores from deep mutational scanning (DMS).
- Written by Brendan Larsen
Papermill parameters¶
In [1]:
# this cell is tagged as parameters for `papermill` parameterization
# Every value defaults to None. `corr_plots_combined` doubles as the sentinel:
# while it is None the notebook fills in default input paths and skips all saves.
# path to the Altair theme file (assigned here, but no code visible in this notebook reads it)
altair_config = None
# path to the YAML file that is parsed into `config`
nipah_config = None
# CSV files with the luciferase validation readings (one file per receptor)
validation_file_E2 = None
validation_file_E3 = None
# CSV files with the DMS functional effects (one file per receptor)
func_scores_E2_file = None
func_scores_E3_file = None
# output paths for the two per-receptor correlation charts
func_score_E2_plot = None
func_score_E3_plot = None
# output path for the side-by-side chart
corr_plots_combined = None
In [2]:
# Parameters
# NOTE(review): this looks like the cell papermill injects at run time to override
# the None defaults declared in the tagged parameters cell — confirm before
# editing it by hand.
# All paths are relative, so they resolve against the current working directory.
nipah_config = "nipah_config.yaml"
altair_config = "data/custom_analyses_data/theme.py"
validation_file_E2 = (
"data/custom_analyses_data/experimental_data/functional_validations_EFNB2.csv"
)
validation_file_E3 = (
"data/custom_analyses_data/experimental_data/functional_validations_EFNB3.csv"
)
func_scores_E2_file = "results/func_effects/averages/CHO_bEFNB2_func_effects.csv"
func_scores_E3_file = "results/func_effects/averages/CHO_bEFNB3_func_effects.csv"
# output locations for the charts produced at the end of the notebook
func_score_E2_plot = "results/images/func_score_E2_validation.html"
func_score_E3_plot = "results/images/func_score_E3_validation.html"
corr_plots_combined = "results/images/corr_plots_combined.html"
Import packages, set working directory, import altair theme¶
In [3]:
import math
import os
import re
import altair as alt
import numpy as np
import pandas as pd
import scipy.stats
import yaml
import sys
# allow more rows for Altair
_ = alt.data_transformers.disable_max_rows()

# setup working directory
# NOTE(review): absolute, machine-specific path — the notebook can only run
# where this directory exists.
PROJECT_DIR = "/fh/fast/bloom_j/computational_notebooks/blarsen/2023/Nipah_Malaysia_RBP_DMS/"
# os.getcwd() does not return a trailing slash (except for the filesystem
# root), so a raw string comparison against PROJECT_DIR could never match.
# Compare normalized paths instead.
if os.path.realpath(os.getcwd()) == os.path.realpath(PROJECT_DIR):
    print("Already in correct directory")
else:
    os.chdir(PROJECT_DIR)
    print("Setup in correct directory")

# import altair themes from /data/custom_analyses_data/theme.py and enable
THEME_DIR = 'data/custom_analyses_data/'
if THEME_DIR not in sys.path:  # re-running the cell must not stack duplicate entries
    sys.path.append(THEME_DIR)
import theme

alt.themes.register('main_theme', theme.main_theme)
# kept as the last expression so the cell still shows the enabled theme as its output
alt.themes.enable('main_theme')
Setup in correct directory
Out[3]:
ThemeRegistry.enable('main_theme')
Read config file¶
In [4]:
# Parse config.yaml into `config`.
# NOTE(review): `config` is reassigned from `nipah_config` in a later cell and is
# not read in between, so this load looks redundant — confirm before removing.
with open("config.yaml") as config_stream:
    config = yaml.safe_load(config_stream)
For running interactively¶
In [5]:
# No parameters were supplied (the sentinel is still None): fall back to the
# default input paths. The plot output paths are left untouched, so they stay
# None and the save steps further down are skipped.
if corr_plots_combined is None:
    func_scores_E2_file = "results/func_effects/averages/CHO_bEFNB2_func_effects.csv"
    func_scores_E3_file = "results/func_effects/averages/CHO_bEFNB3_func_effects.csv"
    validation_file_E2 = "data/custom_analyses_data/experimental_data/functional_validations_EFNB2.csv"
    validation_file_E3 = "data/custom_analyses_data/experimental_data/functional_validations_EFNB3.csv"
    nipah_config = "nipah_config.yaml"
Load in config files¶
In [6]:
# Parse the YAML file named by `nipah_config` into `config`.
with open(nipah_config) as nipah_config_stream:
    config = yaml.safe_load(nipah_config_stream)
Import luciferase (RLUs/uL) readings for each mutant¶
In [7]:
# Read the luciferase readings for each receptor and give the readout column a
# receptor-specific name so both can sit in one table.
# na_filter is a boolean option; False keeps NA detection switched off, which is
# what the previous falsy `None` amounted to.
func_validations_EFNB2 = pd.read_csv(validation_file_E2, na_filter=False).rename(
    columns={"mean_luciferase": "mean_luciferase_E2"}
)
func_validations_EFNB3 = pd.read_csv(validation_file_E3, na_filter=False).rename(
    columns={"mean_luciferase": "mean_luciferase_E3"}
)

# The two tables are joined side by side purely by row position, and the EFNB3
# label column is dropped first. Make sure both files list the same mutations in
# the same order, otherwise readings would be silently paired with the wrong virus.
if not func_validations_EFNB2["mutation"].equals(func_validations_EFNB3["mutation"]):
    raise ValueError(
        "EFNB2 and EFNB3 validation files do not list the same mutations in the "
        "same row order; they cannot be combined by position"
    )

func_validations_EFNB3 = func_validations_EFNB3.drop("mutation", axis=1)
concat = pd.concat([func_validations_EFNB2, func_validations_EFNB3], axis=1)
display(concat.head(30))
mutation | mean_luciferase_E2 | mean_luciferase_E3 | |
---|---|---|---|
0 | Q530F | 13839 | 39157 |
1 | Q530F | 11669 | 38929 |
2 | Q530F | 12583 | 32625 |
3 | Y389T | 9223 | 18 |
4 | Y389T | 9564 | 21 |
5 | Y389T | 6327 | 30 |
6 | Q530E | 8952 | 3 |
7 | Q530E | 4825 | 2 |
8 | Q530E | 7865 | 1 |
9 | P488S | 12841 | 2012 |
10 | P488S | 1023 | 155 |
11 | P488S | 9621 | 1672 |
12 | Q530L | 13695 | 5601 |
13 | Q530L | 13589 | 5730 |
14 | Q530L | 11817 | 5136 |
15 | C162L | 1 | 30 |
16 | C162L | 4 | 9 |
17 | C162L | 1 | 20 |
18 | F168A | 7771 | 24956 |
19 | F168A | 9860 | 29080 |
20 | F168A | 6493 | 23138 |
21 | Unmutated | 8775 | 24502 |
22 | Unmutated | 9407 | 23416 |
23 | Unmutated | 9936 | 32370 |
24 | Q492L | 19403 | 42769 |
25 | Q492L | 10533 | 36135 |
26 | Q492L | 11757 | 46212 |
Now import the functional scores and add a `mutation` column that matches the data frame above, so the two can be merged¶
In [8]:
def _with_mutation_label(scores):
    """Return `scores` with a `mutation` column built as wildtype + site + mutant."""
    return scores.assign(
        mutation=scores["wildtype"] + scores["site"].astype(str) + scores["mutant"]
    )


# DMS functional effects for each receptor, labelled the same way as the
# validation table (e.g. "Q530F") so they can be joined on `mutation`.
func_scores = _with_mutation_label(pd.read_csv(func_scores_E2_file))
func_scores_E3 = _with_mutation_label(pd.read_csv(func_scores_E3_file))

# Columns present in both score tables get an _E2 / _E3 suffix.
func_scores_merged = func_scores.merge(
    func_scores_E3, on=["mutation"], how="left", suffixes=["_E2", "_E3"]
)
# Left join: every validation reading is kept, matched or not.
merged = concat.merge(func_scores_merged, on=["mutation"], how="left")

# Give the "Unmutated" control a tiny non-zero DMS effect in both receptors.
# NOTE(review): the earlier comment said this was needed for a log scale, but the
# charts below put the effect on a linear x-axis — confirm whether the
# placeholder is still required or whether 0 would do.
is_unmutated = merged["mutation"] == "Unmutated"
merged.loc[is_unmutated, ["effect_E2", "effect_E3"]] = 0.0000001
Now Plot Correlations¶
CHO-EFNB2 Entry Correlations¶
In [9]:
# Calculate the Pearson r between the DMS effect and the luciferase readout on
# CHO-bEFNB2. A single missing value in either column would make linregress
# return r = nan, so only complete pairs are used and any excluded rows are
# reported.
# NOTE(review): r is computed on the raw RLU values while the chart's y-axis is
# log-scaled — confirm that this is the intended statistic.
paired_E2 = merged[["effect_E2", "mean_luciferase_E2"]].dropna()
n_excluded_E2 = len(merged) - len(paired_E2)
if n_excluded_E2:
    print(f"Excluded {n_excluded_E2} rows without a complete E2 pair from the regression")
slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(
    paired_E2["effect_E2"], paired_E2["mean_luciferase_E2"]
)
r_value = float(r_value)
def custom_sort_order(array):
    """Order mutation labels for the legend.

    Labels are sorted by the first run of digits they contain (e.g. 530 in
    'Q530F'); labels without digits count as 0. If 'Unmutated' is present, its
    first occurrence is moved to the very front.

    Returns a new list; the input is not modified.
    """

    def site_number(label):
        digits = re.search(r"\d+", label)
        if digits is None:
            return 0
        return int(digits.group())

    ordered = sorted(array, key=site_number)
    if "Unmutated" not in ordered:
        return ordered
    position = ordered.index("Unmutated")
    return ["Unmutated"] + ordered[:position] + ordered[position + 1:]
# Palette for the mutant viruses.
# NOTE(review): despite the variable name, these hex codes look like D3's
# Tableau10 scheme rather than category10 — the name is kept because a later
# cell reads it.
category10_colors = [
    "#4e79a7",
    "#f28e2c",
    "#e15759",
    "#76b7b2",
    "#59a14f",
    "#edc949",
    "#af7aa1",
    "#ff9da7",
    "#9c755f",
    "#bab0ab",
]
# Black for the first legend entry, then one palette colour for each remaining
# distinct mutation label.
n_labels = len(merged["mutation"].unique())
colors = ["black", *category10_colors[: n_labels - 1]]
# Scatter of DMS effect (linear x) against luciferase readout (log10 y) on CHO-bEFNB2.
# The legend order is computed once and shared by the colour and shape encodings
# so the two always agree.
mutation_order_E2 = custom_sort_order(merged["mutation"].unique())

corr_chart = (
    alt.Chart(merged, title=alt.Title("CHO-bEFNB2"))
    .mark_point(size=125, filled=True, opacity=0.5)
    .encode(
        x=alt.X(
            "effect_E2:Q",
            title="Cell entry in DMS",
            scale=alt.Scale(domain=[-4, 1]),
            axis=alt.Axis(values=[-4, -3, -2, -1, 0, 1], tickCount=6),
        ),
        y=alt.Y(
            "mean_luciferase_E2",
            title="RLU/μL in validation assay",  # fixed typo: was "asasy"
            scale=alt.Scale(type="log", base=10),
            # tick labels in scientific notation
            axis=alt.Axis(format=".0e", tickCount=4),
        ),
        color=alt.Color(
            "mutation",
            title="Virus",
            scale=alt.Scale(domain=mutation_order_E2, range=colors),
        ),
        shape=alt.Shape("mutation", scale=alt.Scale(domain=mutation_order_E2)),
        tooltip=["mutation", "effect_E2", "mean_luciferase_E2"],
    )
)
# Anchor for the "r = ..." label: the smallest DMS effect (x) and the largest
# readout (y). int() truncates toward zero, so both are whole numbers.
min_effect_E2 = int(merged["effect_E2"].min())
max_mean_luciferase_E2 = int(merged["mean_luciferase_E2"].max())

r_label_E2 = {
    "x": min_effect_E2,
    "y": max_mean_luciferase_E2,
    "text": f"r = {r_value:.2f}",
}
# One-row inline dataset carrying the label; dx/dy shift it in pixels from the anchor.
text = (
    alt.Chart({"values": [r_label_E2]})
    .encode(x=alt.X("x:Q"), y=alt.Y("y:Q"), text="text:N")
    .mark_text(align="left", baseline="top", dx=-30, dy=-30)
)
# Overlay the r label on the scatter, fix the panel size, and resolve the colour
# and shape scales independently between the two layers.
final_chart = (
    (corr_chart + text)
    .properties(height=200, width=200)
    .resolve_scale(shape='independent', color='independent')
)
func_score_E2_chart = final_chart
func_score_E2_chart.display()
# Save only when parameters were supplied (the sentinel is not None).
if corr_plots_combined is not None:
    func_score_E2_chart.save(func_score_E2_plot)
CHO-EFNB3 entry correlations plot¶
In [10]:
# Calculate the Pearson r between the DMS effect and the luciferase readout on
# CHO-bEFNB3. A single missing value in either column would make linregress
# return r = nan, so only complete pairs are used and any excluded rows are
# reported.
# NOTE(review): r is computed on the raw RLU values while the chart's y-axis is
# log-scaled — confirm that this is the intended statistic.
paired_E3 = merged[["effect_E3", "mean_luciferase_E3"]].dropna()
n_excluded_E3 = len(merged) - len(paired_E3)
if n_excluded_E3:
    print(f"Excluded {n_excluded_E3} rows without a complete E3 pair from the regression")
slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(
    paired_E3["effect_E3"], paired_E3["mean_luciferase_E3"]
)
r_value = float(r_value)
def custom_sort_order(array):
    """Order mutation labels for the legend, wild-type control first.

    Labels are sorted by the first run of digits they contain (e.g. 530 in
    'Q530F'); labels without digits count as 0. The control is then moved to the
    front. The validation data labels it 'Unmutated'; the previous version only
    looked for 'WT', so the control could end up behind other digit-free labels.
    'WT' is still recognised for backward compatibility.

    Returns a new list; the input is not modified.
    """

    def extract_number(mutation):
        found = re.search(r"\d+", mutation)
        return int(found.group()) if found else 0

    ordered = sorted(array, key=extract_number)
    # 'Unmutated' is handled last so it ends up ahead of 'WT' if both occur.
    for control_label in ("WT", "Unmutated"):
        if control_label in ordered:
            ordered.remove(control_label)
            ordered.insert(0, control_label)
    return ordered
# Black for the first legend entry, then one palette colour for each remaining
# distinct mutation label.
n_labels = len(merged["mutation"].unique())
colors = ["black", *category10_colors[: n_labels - 1]]
# Scatter of DMS effect (linear x) against luciferase readout (log10 y) on CHO-bEFNB3.
# The legend order is computed once and shared by the colour and shape encodings
# so the two always agree.
mutation_order_E3 = custom_sort_order(merged["mutation"].unique())

corr_chart = (
    alt.Chart(merged, title=alt.Title("CHO-bEFNB3"))
    .mark_point(size=125, filled=True, opacity=0.5)
    .encode(
        x=alt.X(
            "effect_E3:Q",
            title="Cell entry in DMS",
            scale=alt.Scale(domain=[-4, 1]),
            axis=alt.Axis(values=[-4, -3, -2, -1, 0, 1], tickCount=6),
        ),
        y=alt.Y(
            "mean_luciferase_E3",
            title="RLU/μL in validation assay",  # fixed typo: was "asasy"
            scale=alt.Scale(type="log", base=10),
            # Display in scientific notation
            axis=alt.Axis(format=".0e", tickCount=4),
        ),
        color=alt.Color(
            "mutation",
            title="Virus",
            scale=alt.Scale(domain=mutation_order_E3, range=colors),
        ),
        tooltip=["mutation", "effect_E3", "mean_luciferase_E3"],
        shape=alt.Shape("mutation", scale=alt.Scale(domain=mutation_order_E3)),
    )
)
# Anchor for the "r = ..." label: the smallest DMS effect (x) and the largest
# readout (y). int() truncates toward zero, so both are whole numbers.
min_effect_E3 = int(merged["effect_E3"].min())
max_mean_luciferase_E3 = int(merged["mean_luciferase_E3"].max())

r_label_E3 = {
    "x": min_effect_E3,
    "y": max_mean_luciferase_E3,
    "text": f"r = {r_value:.2f}",
}
# One-row inline dataset carrying the label; dx/dy shift it in pixels from the
# anchor (adjust these to reposition the label).
text = (
    alt.Chart({"values": [r_label_E3]})
    .encode(x=alt.X("x:Q"), y=alt.Y("y:Q"), text="text:N")
    .mark_text(align="left", baseline="top", dx=-30, dy=-15)
)
# Overlay the r label on the scatter, resolve the colour and shape scales
# independently between the two layers, and fix the panel size.
final_chart = (
    (corr_chart + text)
    .resolve_scale(shape='independent', color='independent')
    .properties(width=200, height=200)
)
func_score_E3_chart = final_chart
func_score_E3_chart.display()
# Save only when parameters were supplied (the sentinel is not None).
if corr_plots_combined is not None:
    func_score_E3_chart.save(func_score_E3_plot)
In [11]:
# Combine the plots side by side. The figure is built once so that the chart
# written to disk is exactly the one displayed; previously the saved copy was
# rebuilt without the explicit shared colour resolution.
combined_chart = (func_score_E2_chart | func_score_E3_chart).resolve_scale(color='shared')
combined_chart.display()
# Save only when parameters were supplied (the sentinel is not None).
if corr_plots_combined is not None:
    combined_chart.save(corr_plots_combined)
In [ ]: