Correlation between single mutant variant scores and global epistasis functional effects¶

This notebook analyzes the correlation between the raw functional scores for barcoded variants with single amino-acid mutations and the functional effects for the amino-acid mutations as determined by a global epistasis model.

In [1]:
# Imports
import os
import math
import numpy
import yaml
import matplotlib.colors
import scipy as sp
import altair as alt
import pandas as pd
import plotnine as p9
import dms_variants.codonvarianttable

# Create color palette
def color_gradient_hex(start, end, n):
    """Color function from polyclonal"""
    cmap = matplotlib.colors.LinearSegmentedColormap.from_list(
            name="_", colors=[start, end], N=n
        )
    return [matplotlib.colors.rgb2hex(tup) for tup in cmap(list(range(0, n)))]

# Orange to white to blue color gradient
orangeblue = color_gradient_hex("#E69F00", "white", n=20) + color_gradient_hex("white", "#0072B2", n=20)

# Allow more rows for Altair
_ = alt.data_transformers.disable_max_rows()
In [2]:
# this cell is tagged as `parameters` for papermill parameterization
libA_1  = None
libA_2  = None
libA_3  = None
libA_4  = None

libB_1  = None
libB_2  = None
libB_3  = None
libB_4  = None

scores_dir = None
effects_dir = None
In [3]:
# Parameters
libA_1 = "LibA-220823-293T-1"
libA_2 = "LibA-220823-293T-2"
libA_3 = "LibA-220907-293T-1"
libA_4 = "LibA-220907-293T-2"
libB_1 = "LibB-220823-293T-1"
libB_2 = "LibB-220823-293T-2"
libB_3 = "LibB-220907-293T-1"
libB_4 = "LibB-220907-293T-2"
scores_dir = "results/func_scores/"
effects_dir = "results/func_effects/by_selection/"
In [4]:
# # Uncomment for running interactive
# libA_1 = "LibA-220823-293T-1"
# libA_2 = "LibA-220823-293T-2"
# libA_3 = "LibA-220907-293T-1"
# libA_4 = "LibA-220907-293T-2"

# libB_1 = "LibB-220823-293T-1"
# libB_2 = "LibB-220823-293T-2"
# libB_3 = "LibB-220907-293T-1"
# libB_4 = "LibB-220907-293T-2"

# scores_dir = "../results/func_scores/"
# effects_dir = "../results/func_effects/by_selection/"

Read and process data for both functional scores and functional effects. The functional scores are clipped at the lower (median of stop codons) and upper (2) ends.

In [5]:
# Selection names
selections = [
    libA_1,
    libA_2,
    libA_3,
    libA_4,
    libB_1,
    libB_2,
    libB_3,
    libB_4,
]

# Read and concat all func score files
func_scores = pd.concat(
    [
        pd.read_csv(scores_dir + s + "_func_scores.csv").assign(selection=s) 
        for s in selections
    ],
    ignore_index=True,
)

# Extract single mutants and make unique wildtype, site, and mutant columns
func_scores = (
    func_scores.query("n_aa_substitutions == 1")
    .reset_index(drop=True)
)
func_scores["wildtype"] = func_scores["aa_substitutions"].str[0]
func_scores["site"] = func_scores["aa_substitutions"].str[1:-1].astype("Int64")
func_scores["mutant"] = func_scores["aa_substitutions"].str[-1]

# Get counts of times each single mutant is observed and add to dataframe
mutant_counts = (
    func_scores[["aa_substitutions", "wildtype", "site", "mutant", "barcode", "selection"]]
    # .drop_duplicates()
    .groupby(["aa_substitutions", "wildtype", "site", "mutant", "selection"])
    .size()
    .reset_index(name="num_times_in_single_mutant_variants")
    .groupby(["aa_substitutions", "wildtype", "site", "mutant"])
    .aggregate({
        "num_times_in_single_mutant_variants" : "mean"
    })
    .reset_index()
)
func_scores = (
    func_scores.merge(
        mutant_counts,
        how="inner",
        on=["aa_substitutions", "site", "wildtype", "mutant"],
        validate="many_to_one",
    )
)

# Print the number of sinlge mutant variants
print(f"Total number of mutations measured with single mutant variants: {len(func_scores['aa_substitutions'].unique())}")

# Clip func score values 
lower_floor = func_scores.loc[(func_scores["wildtype"] != "*") & (func_scores["mutant"] == "*")]["func_score"].median()
print(f"functional scores are clipped on the lower end at {lower_floor} (median of stop codons) and on the upper end at 2")
func_scores["func_score"] = func_scores["func_score"].clip(lower=lower_floor, upper=2)

# Average func scores
func_scores = (
    func_scores.groupby(["wildtype", "site", "mutant"])
    .aggregate({
        "func_score" : "median",
        "num_times_in_single_mutant_variants" : "first", 
    })
    .reset_index()
)

# Read and concat all func effect files
func_effects = pd.concat(
    [
        pd.read_csv(effects_dir + s + "_func_effects.csv").assign(selection=s) 
        for s in selections
    ],
    ignore_index=True,
).assign(times_seen=lambda x: x["times_seen"].astype("Int64"))

# Average functional effects
func_effects = (
    func_effects.groupby(["wildtype", "site", "mutant"])
    .aggregate({
        "times_seen" : "mean",
        "functional_effect" : "median",
    })
    .reset_index()
)

# Merge functional scores and functional effects
merged_df = (
    func_scores.merge(
        func_effects,
        how="inner",
        on=["site", "wildtype", "mutant"],
        validate="one_to_one",
    )
)
Total number of mutations measured with single mutant variants: 7566
functional scores are clipped on the lower end at -4.752 (median of stop codons) and on the upper end at 2

Plot correlation between functional scores and functional effects.

In [6]:
# Calculate statistics
r, p = sp.stats.pearsonr(
    merged_df["func_score"], 
    merged_df["functional_effect"]
)
print(f"r correlation functional scores vs functional effects: {r:.2f}")
print(f"r^2 correlation functional scores vs functional effects: {r**2:.2f}")

# Plot data
corr_chart = alt.Chart(
    merged_df,
    title = "mutations seen in >= 1 single mutant variants"
).mark_point(filled=True, color="black", opacity=0.15).encode(
    alt.X(
        "func_score",
        axis=alt.Axis(
            title=["variant scores for barcoded", "variants with single amino-acid mutations"], 
            values=[-4, -3, -2, -1, 0, 1, 2],
            domainWidth=1,
            domainColor="black",
            tickColor="black",
        ),
        scale=alt.Scale(domain=[-4.9,2.25])
    ),
    alt.Y(
        "functional_effect",
        axis=alt.Axis(
            title=["functional effects for mutations", "determined by global epistasis model"], 
            values=[-4, -3, -2, -1, 0, 1],
            domainWidth=1,
            domainColor="black",
            tickColor="black",
        ),
        scale=alt.Scale(domain=[-4.5,1.5])
    ),
    tooltip=[
        "site",
        "wildtype",
        "mutant",
        "func_score",
        "functional_effect",
        alt.Tooltip(
            "num_times_in_single_mutant_variants", format=".2f", title="times seen in single mutant variants averaged across selections",
        ),
        alt.Tooltip(
            "times_seen", format=".2f", title="times seen in all barcoded variants averaged across selections",
        ),
    ],
).properties(
    width=300,
    height=300
).configure_axis(
    grid=False,
    labelFontSize=16,
    titleFontSize=16,
    labelFontWeight="normal",
    titleFontWeight="normal",
).configure_title(
    fontSize=24,
).configure_view(
    stroke=None
)

corr_chart
r correlation functional scores vs functional effects: 0.92
r^2 correlation functional scores vs functional effects: 0.84
Out[6]:

Create the same correlation plot but subsetted on mutations that were observed in >= 2 single mutant barcoded variants.

In [7]:
# Calculate statistics
r, p = sp.stats.pearsonr(
    merged_df.query("num_times_in_single_mutant_variants >= 2")["func_score"], 
    merged_df.query("num_times_in_single_mutant_variants >= 2")["functional_effect"]
)
print(f"r correlation functional scores vs functional effects: {r:.2f}")
print(f"r^2 correlation functional scores vs functional effects: {r**2:.2f}")

# Plot data
corr_chart = alt.Chart(
    merged_df.query("num_times_in_single_mutant_variants >= 2"),
    title = "mutations seen in >= 2 single mutant variants"
).mark_point(filled=True, color="black", opacity=0.15).encode(
    alt.X(
        "func_score",
        axis=alt.Axis(
            title=["variant scores for barcoded", "variants with single amino-acid mutations"], 
            values=[-4, -3, -2, -1, 0, 1, 2],
            domainWidth=1,
            domainColor="black",
            tickColor="black",
        ),
        scale=alt.Scale(domain=[-4.9,2.25])
    ),
    alt.Y(
        "functional_effect",
        axis=alt.Axis(
            title=["functional effects for mutations", "determined by global epistasis model"], 
            values=[-4, -3, -2, -1, 0, 1],
            domainWidth=1,
            domainColor="black",
            tickColor="black",
        ),
        scale=alt.Scale(domain=[-4.5,1.5])
    ),
    tooltip=[
        "site",
        "wildtype",
        "mutant",
        "func_score",
        "functional_effect",
        alt.Tooltip(
            "num_times_in_single_mutant_variants", format=".2f", title="times seen in single mutant variants averaged across selections",
        ),
        alt.Tooltip(
            "times_seen", format=".2f", title="times seen in all barcoded variants averaged across selections",
        ),
    ],
).properties(
    width=300,
    height=300
).configure_axis(
    grid=False,
    labelFontSize=16,
    titleFontSize=16,
    labelFontWeight="normal",
    titleFontWeight="normal",
).configure_title(
    fontSize=24,
).configure_view(
    stroke=None
)

corr_chart
r correlation functional scores vs functional effects: 0.95
r^2 correlation functional scores vs functional effects: 0.91
Out[7]:

Recreate same plot above but formatted for paper

In [8]:
# Calculate statistics
r, p = sp.stats.pearsonr(
    merged_df.query("num_times_in_single_mutant_variants >= 2")["func_score"], 
    merged_df.query("num_times_in_single_mutant_variants >= 2")["functional_effect"]
)
print(f"r correlation functional scores vs functional effects: {r:.2f}")
print(f"r^2 correlation functional scores vs functional effects: {r**2:.2f}")

# Plot data
corr_chart = alt.Chart(
    merged_df.query("num_times_in_single_mutant_variants >= 2"),
    title = "mutations seen in >= 2 single mutant variants"
).mark_point(filled=True, color="black", opacity=0.1, size=10).encode(
    alt.X(
        "func_score",
        axis=alt.Axis(
            title=["variant scores for barcoded", "variants with single amino-acid mutations"], 
            values=[-4, -3, -2, -1, 0, 1, 2],
            domainWidth=1,
            domainColor="black",
            tickColor="black",
        ),
        scale=alt.Scale(domain=[-4.9,2.25])
    ),
    alt.Y(
        "functional_effect",
        axis=alt.Axis(
            title=["functional effects for mutations", "determined by global epistasis model"], 
            values=[-4, -3, -2, -1, 0, 1],
            domainWidth=1,
            domainColor="black",
            tickColor="black",
        ),
        scale=alt.Scale(domain=[-4.5,1.5])
    ),
    tooltip=[
        "site",
        "wildtype",
        "mutant",
        "func_score",
        "functional_effect",
        alt.Tooltip(
            "num_times_in_single_mutant_variants", format=".2f", title="times seen in single mutant variants averaged across selections",
        ),
        alt.Tooltip(
            "times_seen", format=".2f", title="times seen in all barcoded variants averaged across selections",
        ),
    ],
).properties(
    width=90,
    height=90
).configure_axis(
    grid=False,
    labelFontSize=8,
    titleFontSize=8,
    labelFontWeight="normal",
    titleFontWeight="normal",
).configure_title(
    fontSize=12,
).configure_view(
    stroke=None
)

corr_chart
r correlation functional scores vs functional effects: 0.95
r^2 correlation functional scores vs functional effects: 0.91
Out[8]: