Correlation between library A and B functional effects for finalized selections¶
This notebook calculates the correlation between functional effects measured in each library using the finalized selections that are averaged in the end.
In [1]:
# Imports
import os
import math
import numpy
import yaml
import matplotlib.colors
import scipy as sp
import altair as alt
import pandas as pd
import plotnine as p9
import dms_variants.utils
import dms_variants.codonvarianttable
# Create color palette
def color_gradient_hex(start, end, n):
"""Color function from polyclonal"""
cmap = matplotlib.colors.LinearSegmentedColormap.from_list(
name="_", colors=[start, end], N=n
)
return [matplotlib.colors.rgb2hex(tup) for tup in cmap(list(range(0, n)))]
# Orange to white to blue color gradient
orangeblue = color_gradient_hex("#E69F00", "white", n=20) + color_gradient_hex("white", "#0072B2", n=20)
# Allow more rows for Altair
_ = alt.data_transformers.disable_max_rows()
In [2]:
# this cell is tagged as `parameters` for papermill parameterization
libA_1 = None
libA_2 = None
libA_3 = None
libA_4 = None
libB_1 = None
libB_2 = None
libB_3 = None
libB_4 = None
effects_dir = None
MTS = None
In [3]:
# Parameters
libA_1 = "LibA-220823-293T-1"
libA_2 = "LibA-220823-293T-2"
libA_3 = "LibA-220907-293T-1"
libA_4 = "LibA-220907-293T-2"
libB_1 = "LibB-220823-293T-1"
libB_2 = "LibB-220823-293T-2"
libB_3 = "LibB-220907-293T-1"
libB_4 = "LibB-220907-293T-2"
effects_dir = "results/func_effects/by_selection/"
MTS = 2
In [4]:
# # Uncomment for running interactive
# libA_1 = "LibA-220823-293T-1"
# libA_2 = "LibA-220823-293T-2"
# libA_3 = "LibA-220907-293T-1"
# libA_4 = "LibA-220907-293T-2"
# libB_1 = "LibB-220823-293T-1"
# libB_2 = "LibB-220823-293T-2"
# libB_3 = "LibB-220907-293T-1"
# libB_4 = "LibB-220907-293T-2"
# effects_dir = "../results/func_effects/by_selection/"
# MTS = 2
In [5]:
# Selection names
selections = [
libA_1,
libA_2,
libA_3,
libA_4,
libB_1,
libB_2,
libB_3,
libB_4,
]
# Read and concat all func effect files
func_effects = pd.concat(
[
pd.read_csv(effects_dir + s + "_func_effects.csv").assign(selection=s)
for s in selections
],
ignore_index=True,
).assign(times_seen=lambda x: x["times_seen"].astype("Int64"))
# Extract library for each selection
func_effects["library"] = func_effects["selection"].str[:4]
# Average functional effects
func_effects = (
func_effects.groupby(["wildtype", "site", "mutant", "library"])
.aggregate({
"times_seen" : "mean",
"functional_effect" : "median",
})
.reset_index()
)
# Rearrange dataframe to have separate columns for each library
func_effects_A = (
func_effects.loc[func_effects["library"] == "LibA"]
.reset_index(drop=True)
)
func_effects_B = (
func_effects.loc[func_effects["library"] == "LibB"]
.reset_index(drop=True)
)
merged_df = (
func_effects_A.merge(
func_effects_B,
how="inner",
on=["site", "wildtype", "mutant"],
suffixes=["_libA", "_libB"],
validate="one_to_one",
)
)
# Add average times seen column
merged_df["average_times_seen"] = merged_df[["times_seen_libA", "times_seen_libB"]].mean(axis=1)
In [6]:
# # Calculate statistics
# r, p = sp.stats.pearsonr(
# merged_df.loc[(merged_df["average_times_seen"] >= MTS)]["functional_effect_libA"],
# merged_df.loc[(merged_df["average_times_seen"] >= MTS)]["functional_effect_libB"]
# )
# print(f"r correlation LibA vs libB (min_times_seen={MTS}): {r:.2f}")
# print(f"r^2 correlation LibA vs libB (min_times_seen={MTS}): {r**2:.2f}")
slider = alt.binding_range(min=1, max=25, step=1, name="times_seen")
selector = alt.param(name="SelectorName", value=MTS, bind=slider)
# Plot data
corr_chart = alt.Chart(merged_df).mark_point(filled=True, color="black").encode(
alt.X(
"functional_effect_libA",
axis=alt.Axis(
title="lib A",
values=[-4, -3, -2, -1, 0, 1],
domainWidth=1,
domainColor="black",
tickColor="black",
),
scale=alt.Scale(domain=[-4.75,1.5])
),
alt.Y(
"functional_effect_libB",
axis=alt.Axis(
title="lib B",
values=[-4, -3, -2, -1, 0, 1],
domainWidth=1,
domainColor="black",
tickColor="black",
),
scale=alt.Scale(domain=[-4.75,1.5])
),
opacity=alt.condition(
alt.datum.average_times_seen < selector,
alt.value(0),
alt.value(0.1)
),
tooltip=[
"site",
"wildtype",
"mutant",
"functional_effect_libA",
"times_seen_libA",
"functional_effect_libB",
"times_seen_libB",
"average_times_seen",
],
).properties(
width=300,
height=300
).add_params(
selector
).configure_axis(
grid=False,
labelFontSize=16,
titleFontSize=16,
labelFontWeight="normal",
titleFontWeight="normal",
).configure_title(
fontSize=24,
).configure_view(
stroke=None
)
corr_chart
Out[6]:
Also create a small correlation heatmap using similar code as in the main pipeline. This smaller figure probably will fit in the manuscript figure better than the scatter plot.
In [7]:
# Read and concat all func effect files
func_effects = pd.concat(
[
pd.read_csv(effects_dir + s + "_func_effects.csv").assign(selection=s)
for s in selections
],
ignore_index=True,
).assign(times_seen=lambda x: x["times_seen"].astype("Int64"))
# Extract library for each selection
func_effects["library"] = func_effects["selection"].str[:4]
# Average functional effects
func_effects = (
func_effects.groupby(["wildtype", "site", "mutant", "library"])
.aggregate({
"times_seen" : "mean",
"functional_effect" : "median",
})
.reset_index()
)
func_effects_tidy = func_effects.assign(
mutation=lambda x: x["wildtype"] + x["site"].astype(str) + x["mutant"]
).melt(
id_vars=["library", "mutation", "times_seen"],
value_vars=["functional_effect"],
var_name="phenotype",
value_name="effect",
)
# do analysis for each "times_seen"
func_effects_tidy = pd.concat(
[
func_effects_tidy.query("times_seen >= @t").assign(min_times_seen=t)
for t in [1, MTS, 2 * MTS]
]
)
corrs = (
dms_variants.utils.tidy_to_corr(
df=func_effects_tidy,
sample_col="library",
label_col="mutation",
value_col="effect",
group_cols=["min_times_seen"],
)
.assign(r=lambda x: x["correlation"])
.drop(columns="correlation")
.assign(
min_times_seen=lambda x: "min times seen " + x["min_times_seen"].astype(str)
)
)
corr_chart = (
alt.Chart(corrs)
.encode(
alt.X("library_1", title=None),
alt.Y("library_2", title=None),
column=alt.Column("min_times_seen", title=None),
color=alt.Color("r", scale=alt.Scale(zero=True)),
tooltip=[
alt.Tooltip(c, format=".3g") if c == "r" else c
for c in ["library_1", "library_2", "r"]
],
)
.mark_rect(stroke="black")
.properties(
width=alt.Step(40),
height=alt.Step(40)
)
.configure_axis(labelLimit=500)
)
display(corr_chart)
Recreate same plot above but formatted for paper
In [8]:
# Read and concat all func effect files
func_effects = pd.concat(
[
pd.read_csv(effects_dir + s + "_func_effects.csv").assign(selection=s)
for s in selections
],
ignore_index=True,
).assign(times_seen=lambda x: x["times_seen"].astype("Int64"))
# Extract library for each selection
func_effects["library"] = func_effects["selection"].str[:4]
# Average functional effects
func_effects = (
func_effects.groupby(["wildtype", "site", "mutant", "library"])
.aggregate({
"times_seen" : "mean",
"functional_effect" : "median",
})
.reset_index()
)
func_effects_tidy = func_effects.assign(
mutation=lambda x: x["wildtype"] + x["site"].astype(str) + x["mutant"]
).melt(
id_vars=["library", "mutation", "times_seen"],
value_vars=["functional_effect"],
var_name="phenotype",
value_name="effect",
)
# do analysis for each "times_seen"
func_effects_tidy = pd.concat(
[
func_effects_tidy.query("times_seen >= @t").assign(min_times_seen=t)
for t in [MTS]
]
)
corrs = (
dms_variants.utils.tidy_to_corr(
df=func_effects_tidy,
sample_col="library",
label_col="mutation",
value_col="effect",
group_cols=["min_times_seen"],
)
.assign(r=lambda x: x["correlation"])
.drop(columns="correlation")
.assign(
min_times_seen=lambda x: "min times seen " + x["min_times_seen"].astype(str)
)
)
corr_chart = (
alt.Chart(corrs)
.encode(
alt.X("library_1", title=None),
alt.Y("library_2", title=None),
column=alt.Column("min_times_seen", title=None),
color=alt.Color("r", scale=alt.Scale(zero=True)),
tooltip=[
alt.Tooltip(c, format=".3g") if c == "r" else c
for c in ["library_1", "library_2", "r"]
],
)
.mark_rect(stroke="black")
.properties(
width=alt.Step(10),
height=alt.Step(10)
)
.configure_axis(
labelLimit=500,
grid=False,
labelFontSize=8,
titleFontSize=8,
labelFontWeight="normal",
)
)
display(corr_chart)