Correlation between library A and B functional effects for finalized selections

This notebook calculates the correlation between the functional effects measured in each library, using the finalized selections that are averaged at the end of the pipeline.
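
As a quick orientation to how the comparison works: the per-mutation functional effects from the two libraries are paired up and their Pearson correlation is computed. A minimal sketch of just that calculation, using made-up effect values rather than data from this pipeline, is:

import numpy as np
import scipy.stats

# Hypothetical per-mutation functional effects measured in each library
effects_libA = np.array([-3.2, -0.1, 0.4, -1.8, 0.0])
effects_libB = np.array([-2.9, 0.2, 0.3, -2.1, -0.1])

r, p = scipy.stats.pearsonr(effects_libA, effects_libB)
print(f"r = {r:.2f}, r^2 = {r**2:.2f}")

The cells below do the real version of this: effects are first averaged across selections within each library, merged by mutation, and filtered (or faded out) below a minimum times_seen threshold before correlating.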

In [1]:
# Imports
import os
import math
import numpy
import yaml
import matplotlib.colors
import scipy as sp
import scipy.stats  # makes `sp.stats` available for the Pearson correlation cell below
import altair as alt
import pandas as pd
import plotnine as p9
import dms_variants.utils
import dms_variants.codonvarianttable

# Create color palette
def color_gradient_hex(start, end, n):
    """Color function from polyclonal"""
    cmap = matplotlib.colors.LinearSegmentedColormap.from_list(
            name="_", colors=[start, end], N=n
        )
    return [matplotlib.colors.rgb2hex(tup) for tup in cmap(list(range(0, n)))]

# Orange to white to blue color gradient
orangeblue = color_gradient_hex("#E69F00", "white", n=20) + color_gradient_hex("white", "#0072B2", n=20)

# Allow more rows for Altair
_ = alt.data_transformers.disable_max_rows()
In [2]:
# this cell is tagged as `parameters` for papermill parameterization
libA_1  = None
libA_2  = None
libA_3  = None
libA_4  = None

libB_1  = None
libB_2  = None
libB_3  = None
libB_4  = None

effects_dir = None
MTS = None
In [3]:
# Parameters
libA_1 = "LibA-220823-293T-1"
libA_2 = "LibA-220823-293T-2"
libA_3 = "LibA-220907-293T-1"
libA_4 = "LibA-220907-293T-2"
libB_1 = "LibB-220823-293T-1"
libB_2 = "LibB-220823-293T-2"
libB_3 = "LibB-220907-293T-1"
libB_4 = "LibB-220907-293T-2"
effects_dir = "results/func_effects/by_selection/"
MTS = 2
In [4]:
# # Uncomment to run interactively
# libA_1 = "LibA-220823-293T-1"
# libA_2 = "LibA-220823-293T-2"
# libA_3 = "LibA-220907-293T-1"
# libA_4 = "LibA-220907-293T-2"

# libB_1 = "LibB-220823-293T-1"
# libB_2 = "LibB-220823-293T-2"
# libB_3 = "LibB-220907-293T-1"
# libB_4 = "LibB-220907-293T-2"

# effects_dir = "../results/func_effects/by_selection/"
# MTS = 2
In [5]:
# Selection names
selections = [
    libA_1,
    libA_2,
    libA_3,
    libA_4,
    libB_1,
    libB_2,
    libB_3,
    libB_4,
]

# Read and concat all func effect files
func_effects = pd.concat(
    [
        pd.read_csv(effects_dir + s + "_func_effects.csv").assign(selection=s) 
        for s in selections
    ],
    ignore_index=True,
).assign(times_seen=lambda x: x["times_seen"].astype("Int64"))

# Extract library for each selection
func_effects["library"] = func_effects["selection"].str[:4]

# Average functional effects across selections within each library (mean times_seen, median effect)
func_effects = (
    func_effects.groupby(["wildtype", "site", "mutant", "library"])
    .aggregate({
        "times_seen" : "mean",
        "functional_effect" : "median",
    })
    .reset_index()
)

# Rearrange dataframe to have separate columns for each library
func_effects_A = (
    func_effects.loc[func_effects["library"] == "LibA"]
    .reset_index(drop=True)
)
func_effects_B = (
    func_effects.loc[func_effects["library"] == "LibB"]
    .reset_index(drop=True)
)
merged_df = (
    func_effects_A.merge(
        func_effects_B,
        how="inner",
        on=["site", "wildtype", "mutant"],
        suffixes=["_libA", "_libB"],
        validate="one_to_one",
    )
)

# Add average times seen column
merged_df["average_times_seen"] = merged_df[["times_seen_libA", "times_seen_libB"]].mean(axis=1) 
In [6]:
# # Calculate statistics
# r, p = sp.stats.pearsonr(
#     merged_df.loc[(merged_df["average_times_seen"] >= MTS)]["functional_effect_libA"], 
#     merged_df.loc[(merged_df["average_times_seen"] >= MTS)]["functional_effect_libB"]
# )
# print(f"r correlation LibA vs libB (min_times_seen={MTS}): {r:.2f}")
# print(f"r^2 correlation LibA vs libB (min_times_seen={MTS}): {r**2:.2f}")


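# Slider controlling the minimum average times_seen displayed in the scatter plot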
slider = alt.binding_range(min=1, max=25, step=1, name="times_seen")
selector = alt.param(name="SelectorName", value=MTS, bind=slider)

# Plot data
corr_chart = alt.Chart(merged_df).mark_point(filled=True, color="black").encode(
    alt.X(
        "functional_effect_libA",
        axis=alt.Axis(
            title="lib A", 
            values=[-4, -3, -2, -1, 0, 1],
            domainWidth=1,
            domainColor="black",
            tickColor="black",
        ),
        scale=alt.Scale(domain=[-4.75,1.5])
    ),
    alt.Y(
        "functional_effect_libB",
        axis=alt.Axis(
            title="lib B", 
            values=[-4, -3, -2, -1, 0, 1],
            domainWidth=1,
            domainColor="black",
            tickColor="black",
        ),
        scale=alt.Scale(domain=[-4.75,1.5])
    ),
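    # Hide points whose average times_seen falls below the slider threshold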
    opacity=alt.condition(
        alt.datum.average_times_seen < selector,
        alt.value(0),
        alt.value(0.1)
    ),
    tooltip=[
        "site",
        "wildtype",
        "mutant",
        "functional_effect_libA",
        "times_seen_libA",
        "functional_effect_libB",
        "times_seen_libB",
        "average_times_seen",
    ],
).properties(
    width=300,
    height=300
).add_params(
   selector
).configure_axis(
    grid=False,
    labelFontSize=16,
    titleFontSize=16,
    labelFontWeight="normal",
    titleFontWeight="normal",
).configure_title(
    fontSize=24,
).configure_view(
    stroke=None
)

corr_chart
Out[6]:

Also create a small correlation heatmap using code similar to that in the main pipeline. This smaller figure will probably fit into the manuscript figure better than the scatter plot.

In [7]:
# Read and concat all func effect files
func_effects = pd.concat(
    [
        pd.read_csv(effects_dir + s + "_func_effects.csv").assign(selection=s) 
        for s in selections
    ],
    ignore_index=True,
).assign(times_seen=lambda x: x["times_seen"].astype("Int64"))

# Extract library for each selection
func_effects["library"] = func_effects["selection"].str[:4]

# Average functional effects across selections within each library (mean times_seen, median effect)
func_effects = (
    func_effects.groupby(["wildtype", "site", "mutant", "library"])
    .aggregate({
        "times_seen" : "mean",
        "functional_effect" : "median",
    })
    .reset_index()
)

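# Reshape to tidy format (one row per library / mutation / phenotype) for tidy_to_corr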
func_effects_tidy = func_effects.assign(
    mutation=lambda x: x["wildtype"] + x["site"].astype(str) + x["mutant"]
).melt(
    id_vars=["library", "mutation", "times_seen"],
    value_vars=["functional_effect"],
    var_name="phenotype",
    value_name="effect",
)

# Repeat the analysis at several minimum `times_seen` thresholds
func_effects_tidy = pd.concat(
    [
        func_effects_tidy.query("times_seen >= @t").assign(min_times_seen=t)
        for t in [1, MTS, 2 * MTS]
    ]
)

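# Pairwise correlation of mutation effects between libraries at each min_times_seen threshold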
corrs = (
    dms_variants.utils.tidy_to_corr(
        df=func_effects_tidy,
        sample_col="library",
        label_col="mutation",
        value_col="effect",
        group_cols=["min_times_seen"],
    )
    .assign(r=lambda x: x["correlation"])
    .drop(columns="correlation")
    .assign(
        min_times_seen=lambda x: "min times seen " + x["min_times_seen"].astype(str)
    )
)

corr_chart = (
    alt.Chart(corrs)
    .encode(
        alt.X("library_1", title=None),
        alt.Y("library_2", title=None),
        column=alt.Column("min_times_seen", title=None),
        color=alt.Color("r", scale=alt.Scale(zero=True)),
        tooltip=[
            alt.Tooltip(c, format=".3g") if c == "r" else c
            for c in ["library_1", "library_2", "r"]
        ],
    )
    .mark_rect(stroke="black")
    .properties(
        width=alt.Step(40), 
        height=alt.Step(40)
    )
    .configure_axis(labelLimit=500)
)

display(corr_chart)

Recreate the same plot as above, but formatted for the paper.

In [8]:
# Read and concat all func effect files
func_effects = pd.concat(
    [
        pd.read_csv(effects_dir + s + "_func_effects.csv").assign(selection=s) 
        for s in selections
    ],
    ignore_index=True,
).assign(times_seen=lambda x: x["times_seen"].astype("Int64"))

# Extract library for each selection
func_effects["library"] = func_effects["selection"].str[:4]

# Average functional effects across selections within each library (mean times_seen, median effect)
func_effects = (
    func_effects.groupby(["wildtype", "site", "mutant", "library"])
    .aggregate({
        "times_seen" : "mean",
        "functional_effect" : "median",
    })
    .reset_index()
)

func_effects_tidy = func_effects.assign(
    mutation=lambda x: x["wildtype"] + x["site"].astype(str) + x["mutant"]
).melt(
    id_vars=["library", "mutation", "times_seen"],
    value_vars=["functional_effect"],
    var_name="phenotype",
    value_name="effect",
)

# Restrict to the single minimum `times_seen` threshold used for the paper figure
func_effects_tidy = pd.concat(
    [
        func_effects_tidy.query("times_seen >= @t").assign(min_times_seen=t)
        for t in [MTS]
    ]
)

corrs = (
    dms_variants.utils.tidy_to_corr(
        df=func_effects_tidy,
        sample_col="library",
        label_col="mutation",
        value_col="effect",
        group_cols=["min_times_seen"],
    )
    .assign(r=lambda x: x["correlation"])
    .drop(columns="correlation")
    .assign(
        min_times_seen=lambda x: "min times seen " + x["min_times_seen"].astype(str)
    )
)

corr_chart = (
    alt.Chart(corrs)
    .encode(
        alt.X("library_1", title=None),
        alt.Y("library_2", title=None),
        column=alt.Column("min_times_seen", title=None),
        color=alt.Color("r", scale=alt.Scale(zero=True)),
        tooltip=[
            alt.Tooltip(c, format=".3g") if c == "r" else c
            for c in ["library_1", "library_2", "r"]
        ],
    )
    .mark_rect(stroke="black")
    .properties(
        width=alt.Step(10), 
        height=alt.Step(10)
    )
    .configure_axis(
        labelLimit=500,
        grid=False,
        labelFontSize=8,
        titleFontSize=8,
        labelFontWeight="normal",
    )
)

display(corr_chart)