Correlation between single mutant variant scores and global epistasis functional effects¶
This notebook analyzes the correlation between the raw functional scores for barcoded variants with single amino-acid mutations and the functional effects for the amino-acid mutations as determined by a global epistasis model.
In [1]:
# Imports
import os
import math
import numpy
import yaml
import matplotlib.colors
import scipy as sp
import altair as alt
import pandas as pd
import plotnine as p9
import dms_variants.codonvarianttable
# Create color palette
def color_gradient_hex(start, end, n):
"""Color function from polyclonal"""
cmap = matplotlib.colors.LinearSegmentedColormap.from_list(
name="_", colors=[start, end], N=n
)
return [matplotlib.colors.rgb2hex(tup) for tup in cmap(list(range(0, n)))]
# Orange to white to blue color gradient
orangeblue = color_gradient_hex("#E69F00", "white", n=20) + color_gradient_hex("white", "#0072B2", n=20)
# Allow more rows for Altair
_ = alt.data_transformers.disable_max_rows()
In [2]:
# this cell is tagged as `parameters` for papermill parameterization
libA_1 = None
libA_2 = None
libA_3 = None
libA_4 = None
libB_1 = None
libB_2 = None
libB_3 = None
libB_4 = None
scores_dir = None
effects_dir = None
In [3]:
# Parameters
libA_1 = "LibA-220823-293T-1"
libA_2 = "LibA-220823-293T-2"
libA_3 = "LibA-220907-293T-1"
libA_4 = "LibA-220907-293T-2"
libB_1 = "LibB-220823-293T-1"
libB_2 = "LibB-220823-293T-2"
libB_3 = "LibB-220907-293T-1"
libB_4 = "LibB-220907-293T-2"
scores_dir = "results/func_scores/"
effects_dir = "results/func_effects/by_selection/"
In [4]:
# # Uncomment for running interactive
# libA_1 = "LibA-220823-293T-1"
# libA_2 = "LibA-220823-293T-2"
# libA_3 = "LibA-220907-293T-1"
# libA_4 = "LibA-220907-293T-2"
# libB_1 = "LibB-220823-293T-1"
# libB_2 = "LibB-220823-293T-2"
# libB_3 = "LibB-220907-293T-1"
# libB_4 = "LibB-220907-293T-2"
# scores_dir = "../results/func_scores/"
# effects_dir = "../results/func_effects/by_selection/"
Read and process data for both functional scores and functional effects. The functional scores are clipped at the lower (median of stop codons) and upper (2) ends.
In [5]:
# Selection names
selections = [
libA_1,
libA_2,
libA_3,
libA_4,
libB_1,
libB_2,
libB_3,
libB_4,
]
# Read and concat all func score files
func_scores = pd.concat(
[
pd.read_csv(scores_dir + s + "_func_scores.csv").assign(selection=s)
for s in selections
],
ignore_index=True,
)
# Extract single mutants and make unique wildtype, site, and mutant columns
func_scores = (
func_scores.query("n_aa_substitutions == 1")
.reset_index(drop=True)
)
func_scores["wildtype"] = func_scores["aa_substitutions"].str[0]
func_scores["site"] = func_scores["aa_substitutions"].str[1:-1].astype("Int64")
func_scores["mutant"] = func_scores["aa_substitutions"].str[-1]
# Get counts of times each single mutant is observed and add to dataframe
mutant_counts = (
func_scores[["aa_substitutions", "wildtype", "site", "mutant", "barcode", "selection"]]
# .drop_duplicates()
.groupby(["aa_substitutions", "wildtype", "site", "mutant", "selection"])
.size()
.reset_index(name="num_times_in_single_mutant_variants")
.groupby(["aa_substitutions", "wildtype", "site", "mutant"])
.aggregate({
"num_times_in_single_mutant_variants" : "mean"
})
.reset_index()
)
func_scores = (
func_scores.merge(
mutant_counts,
how="inner",
on=["aa_substitutions", "site", "wildtype", "mutant"],
validate="many_to_one",
)
)
# Print the number of sinlge mutant variants
print(f"Total number of mutations measured with single mutant variants: {len(func_scores['aa_substitutions'].unique())}")
# Clip func score values
lower_floor = func_scores.loc[(func_scores["wildtype"] != "*") & (func_scores["mutant"] == "*")]["func_score"].median()
print(f"functional scores are clipped on the lower end at {lower_floor} (median of stop codons) and on the upper end at 2")
func_scores["func_score"] = func_scores["func_score"].clip(lower=lower_floor, upper=2)
# Average func scores
func_scores = (
func_scores.groupby(["wildtype", "site", "mutant"])
.aggregate({
"func_score" : "median",
"num_times_in_single_mutant_variants" : "first",
})
.reset_index()
)
# Read and concat all func effect files
func_effects = pd.concat(
[
pd.read_csv(effects_dir + s + "_func_effects.csv").assign(selection=s)
for s in selections
],
ignore_index=True,
).assign(times_seen=lambda x: x["times_seen"].astype("Int64"))
# Average functional effects
func_effects = (
func_effects.groupby(["wildtype", "site", "mutant"])
.aggregate({
"times_seen" : "mean",
"functional_effect" : "median",
})
.reset_index()
)
# Merge functional scores and functional effects
merged_df = (
func_scores.merge(
func_effects,
how="inner",
on=["site", "wildtype", "mutant"],
validate="one_to_one",
)
)
Total number of mutations measured with single mutant variants: 7566 functional scores are clipped on the lower end at -4.752 (median of stop codons) and on the upper end at 2
Plot correlation between functional scores and functional effects.
In [6]:
# Calculate statistics
r, p = sp.stats.pearsonr(
merged_df["func_score"],
merged_df["functional_effect"]
)
print(f"r correlation functional scores vs functional effects: {r:.2f}")
print(f"r^2 correlation functional scores vs functional effects: {r**2:.2f}")
# Plot data
corr_chart = alt.Chart(
merged_df,
title = "mutations seen in >= 1 single mutant variants"
).mark_point(filled=True, color="black", opacity=0.15).encode(
alt.X(
"func_score",
axis=alt.Axis(
title=["variant scores for barcoded", "variants with single amino-acid mutations"],
values=[-4, -3, -2, -1, 0, 1, 2],
domainWidth=1,
domainColor="black",
tickColor="black",
),
scale=alt.Scale(domain=[-4.9,2.25])
),
alt.Y(
"functional_effect",
axis=alt.Axis(
title=["functional effects for mutations", "determined by global epistasis model"],
values=[-4, -3, -2, -1, 0, 1],
domainWidth=1,
domainColor="black",
tickColor="black",
),
scale=alt.Scale(domain=[-4.5,1.5])
),
tooltip=[
"site",
"wildtype",
"mutant",
"func_score",
"functional_effect",
alt.Tooltip(
"num_times_in_single_mutant_variants", format=".2f", title="times seen in single mutant variants averaged across selections",
),
alt.Tooltip(
"times_seen", format=".2f", title="times seen in all barcoded variants averaged across selections",
),
],
).properties(
width=300,
height=300
).configure_axis(
grid=False,
labelFontSize=16,
titleFontSize=16,
labelFontWeight="normal",
titleFontWeight="normal",
).configure_title(
fontSize=24,
).configure_view(
stroke=None
)
corr_chart
r correlation functional scores vs functional effects: 0.92 r^2 correlation functional scores vs functional effects: 0.84
Out[6]:
Create the same correlation plot but subsetted on mutations that were observed in >= 2 single mutant barcoded variants.
In [7]:
# Calculate statistics
r, p = sp.stats.pearsonr(
merged_df.query("num_times_in_single_mutant_variants >= 2")["func_score"],
merged_df.query("num_times_in_single_mutant_variants >= 2")["functional_effect"]
)
print(f"r correlation functional scores vs functional effects: {r:.2f}")
print(f"r^2 correlation functional scores vs functional effects: {r**2:.2f}")
# Plot data
corr_chart = alt.Chart(
merged_df.query("num_times_in_single_mutant_variants >= 2"),
title = "mutations seen in >= 2 single mutant variants"
).mark_point(filled=True, color="black", opacity=0.15).encode(
alt.X(
"func_score",
axis=alt.Axis(
title=["variant scores for barcoded", "variants with single amino-acid mutations"],
values=[-4, -3, -2, -1, 0, 1, 2],
domainWidth=1,
domainColor="black",
tickColor="black",
),
scale=alt.Scale(domain=[-4.9,2.25])
),
alt.Y(
"functional_effect",
axis=alt.Axis(
title=["functional effects for mutations", "determined by global epistasis model"],
values=[-4, -3, -2, -1, 0, 1],
domainWidth=1,
domainColor="black",
tickColor="black",
),
scale=alt.Scale(domain=[-4.5,1.5])
),
tooltip=[
"site",
"wildtype",
"mutant",
"func_score",
"functional_effect",
alt.Tooltip(
"num_times_in_single_mutant_variants", format=".2f", title="times seen in single mutant variants averaged across selections",
),
alt.Tooltip(
"times_seen", format=".2f", title="times seen in all barcoded variants averaged across selections",
),
],
).properties(
width=300,
height=300
).configure_axis(
grid=False,
labelFontSize=16,
titleFontSize=16,
labelFontWeight="normal",
titleFontWeight="normal",
).configure_title(
fontSize=24,
).configure_view(
stroke=None
)
corr_chart
r correlation functional scores vs functional effects: 0.95 r^2 correlation functional scores vs functional effects: 0.91
Out[7]:
Recreate same plot above but formatted for paper
In [8]:
# Calculate statistics
r, p = sp.stats.pearsonr(
merged_df.query("num_times_in_single_mutant_variants >= 2")["func_score"],
merged_df.query("num_times_in_single_mutant_variants >= 2")["functional_effect"]
)
print(f"r correlation functional scores vs functional effects: {r:.2f}")
print(f"r^2 correlation functional scores vs functional effects: {r**2:.2f}")
# Plot data
corr_chart = alt.Chart(
merged_df.query("num_times_in_single_mutant_variants >= 2"),
title = "mutations seen in >= 2 single mutant variants"
).mark_point(filled=True, color="black", opacity=0.1, size=10).encode(
alt.X(
"func_score",
axis=alt.Axis(
title=["variant scores for barcoded", "variants with single amino-acid mutations"],
values=[-4, -3, -2, -1, 0, 1, 2],
domainWidth=1,
domainColor="black",
tickColor="black",
),
scale=alt.Scale(domain=[-4.9,2.25])
),
alt.Y(
"functional_effect",
axis=alt.Axis(
title=["functional effects for mutations", "determined by global epistasis model"],
values=[-4, -3, -2, -1, 0, 1],
domainWidth=1,
domainColor="black",
tickColor="black",
),
scale=alt.Scale(domain=[-4.5,1.5])
),
tooltip=[
"site",
"wildtype",
"mutant",
"func_score",
"functional_effect",
alt.Tooltip(
"num_times_in_single_mutant_variants", format=".2f", title="times seen in single mutant variants averaged across selections",
),
alt.Tooltip(
"times_seen", format=".2f", title="times seen in all barcoded variants averaged across selections",
),
],
).properties(
width=90,
height=90
).configure_axis(
grid=False,
labelFontSize=8,
titleFontSize=8,
labelFontWeight="normal",
titleFontWeight="normal",
).configure_title(
fontSize=12,
).configure_view(
stroke=None
)
corr_chart
r correlation functional scores vs functional effects: 0.95 r^2 correlation functional scores vs functional effects: 0.91
Out[8]: