Average mutation functional effects for a condition¶
Import Python modules.
We use polyclonal for the plotting:
In [1]:
import itertools
import math
import altair as alt
import dms_variants.utils
import pandas as pd
import polyclonal
import polyclonal.plot
This notebook is parameterized by papermill.
The next cell is tagged as `parameters` so that papermill can inject the passed parameter values in a cell that follows it.
In [2]:
# this cell is tagged parameters for `papermill` parameterization
# These are placeholders only: papermill injects the real values for a run
# in a following "# Parameters" cell, which shadows these assignments.
site_numbering_map_csv = None  # CSV with "reference_site" / "sequential_site" numbering (read below)
mutation_annotations_csv = None  # optional CSV of per-mutation annotations (read only if set)
func_effects_csv = None  # path for averaged functional effects CSV (used later in the notebook — not in this chunk)
latent_effects_csv = None  # path for averaged latent-phenotype effects CSV (used later — not in this chunk)
latent_html = None  # path for latent-effects interactive chart HTML (used later — not in this chunk)
functional_html = None  # path for functional-effects interactive chart HTML (used later — not in this chunk)
params = None  # dict of analysis parameters (expects at least "selections"; optionally "plot_kwargs", etc.)
In [3]:
# Parameters
# NOTE: this cell is auto-injected by papermill with the concrete values for
# this run; it overrides the placeholder assignments in the tagged
# `parameters` cell above. Do not edit by hand.
params = {
    "avg_method": "median",
    "per_selection_tooltips": True,
    "plot_kwargs": {
        # amino-acid ordering for heatmap rows (stop codon last)
        "alphabet": [
            "R",
            "K",
            "H",
            "D",
            "E",
            "Q",
            "N",
            "S",
            "T",
            "Y",
            "W",
            "F",
            "A",
            "I",
            "L",
            "M",
            "V",
            "G",
            "P",
            "C",
            "*",
        ],
        # initial slider value; also used below as the correlation cutoff
        "addtl_slider_stats": {"times_seen": 2},
        "heatmap_max_at_least": 2,
        "heatmap_min_fixed": -5,
        "init_floor_at_zero": False,
        "init_site_statistic": "mean",
        "site_zoom_bar_color_col": "region",
        "slider_binding_range_kwargs": {
            "times_seen": {"step": 1, "min": 1, "max": 25},
            "n_selections": {"step": 1},
        },
    },
    # per-selection functional-effect CSVs to average (read in the next cell)
    "selections": [
        "A-240129-func-1",
        "A-240129-func-2",
        "A-240222-func-1",
        "A-240222-func-2",
        "A-240222-func-3",
        "B-240501-func-1",
        "B-240501-func-2",
        "B-240501-func-3",
    ],
}
mutation_annotations_csv = None
site_numbering_map_csv = "data/site_numbering_map.csv"
func_effects_csv = "results/func_effects/averages/HEK293T_entry_func_effects.csv"
latent_effects_csv = "results/func_effects/averages/HEK293T_entry_latent_effects.csv"
functional_html = "results/func_effects/averages/HEK293T_entry_func_effects.html"
latent_html = "results/func_effects/averages/HEK293T_entry_latent_effects.html"
Read the input data:
In [4]:
# Site-numbering map: rename the reference numbering to the canonical "site"
# column used throughout the notebook.
site_numbering_map = pd.read_csv(site_numbering_map_csv).rename(
    columns={"reference_site": "site"}
)
# every site must have both a reference and a sequential number
assert site_numbering_map[["site", "sequential_site"]].notnull().all().all()

# additional "*site" numbering columns beyond the primary "site" column
addtl_site_cols = [
    col
    for col in site_numbering_map.columns
    if col.endswith("site") and col != "site"
]

# optional per-mutation annotations, only read when a CSV is provided
if mutation_annotations_csv:
    mutation_annotations = pd.read_csv(mutation_annotations_csv)

selections = params["selections"]

# concatenate the per-selection functional effects, labeling each row with
# the selection it came from; times_seen becomes a nullable integer
per_selection_effects = [
    pd.read_csv(f"results/func_effects/by_selection/{sel}_func_effects.csv").assign(
        selection=sel
    )
    for sel in selections
]
func_effects = pd.concat(per_selection_effects, ignore_index=True).assign(
    times_seen=lambda d: d["times_seen"].astype("Int64")
)
Correlations among selections¶
Compute the correlations in the mutation effects in terms of functional scores and latent-phenotype effects across selections:
In [5]:
# We compute for several times seen values, get those:
# We compute correlations at several "times seen" cutoffs; get the initial
# cutoff from the plot parameters, falling back to 3 if absent:
try:
    init_times_seen = params["plot_kwargs"]["addtl_slider_stats"]["times_seen"]
except KeyError:
    print("No times seen in params, using a value of 3")
    init_times_seen = 3

# long-form table: one row per (selection, mutation, phenotype) with the effect
func_effects_tidy = func_effects.assign(
    mutation=lambda x: x["wildtype"] + x["site"].astype(str) + x["mutant"]
).melt(
    id_vars=["selection", "mutation", "times_seen"],
    value_vars=["latent_phenotype_effect", "functional_effect"],
    var_name="phenotype",
    value_name="effect",
)

# replicate the tidy data at each minimum "times_seen" cutoff; de-duplicate
# the cutoffs so that init_times_seen == 1 (or 2 * init == init) does not
# produce duplicated rows and a duplicated facet column in the chart
times_seen_cutoffs = sorted({1, init_times_seen, 2 * init_times_seen})
func_effects_tidy = pd.concat(
    [
        func_effects_tidy.query("times_seen >= @t").assign(min_times_seen=t)
        for t in times_seen_cutoffs
    ]
)

# pairwise correlations among selections, per phenotype and per cutoff;
# report r^2 rather than r
corrs = (
    dms_variants.utils.tidy_to_corr(
        df=func_effects_tidy,
        sample_col="selection",
        label_col="mutation",
        value_col="effect",
        group_cols=["phenotype", "min_times_seen"],
    )
    .assign(r2=lambda x: x["correlation"] ** 2)
    .drop(columns="correlation")
    .assign(
        min_times_seen=lambda x: "min times seen " + x["min_times_seen"].astype(str)
    )
)

# one heatmap of pairwise r^2 per phenotype, faceted by cutoff
for phenotype, phenotype_corr in corrs.groupby("phenotype"):
    corr_chart = (
        alt.Chart(phenotype_corr)
        .encode(
            alt.X("selection_1", title=None),
            alt.Y("selection_2", title=None),
            column=alt.Column("min_times_seen", title=None),
            color=alt.Color("r2", scale=alt.Scale(zero=True)),
            tooltip=[
                alt.Tooltip(c, format=".3g") if c == "r2" else c
                for c in ["phenotype", "selection_1", "selection_2", "r2"]
            ],
        )
        .mark_rect(stroke="black")
        .properties(width=alt.Step(15), height=alt.Step(15), title=phenotype)
        .configure_axis(labelLimit=500)
    )
    display(corr_chart)
Make a scatter plot:
In [6]:
print(f"Correlation scatter plot for times_seen filter of {init_times_seen}")

# Mutation-labeled effects passing the times_seen filter. This frame is
# identical for every selection pair, so compute it once here rather than
# rebuilding it inside the pair loop (the original recomputed it per pair).
filtered_effects = func_effects.assign(
    mutation=lambda x: x["wildtype"] + x["site"].astype(str) + x["mutant"]
).query("times_seen >= @init_times_seen")

corr_panels = []
for sel1, sel2 in itertools.combinations(sorted(func_effects["selection"].unique()), 2):
    # inner-merge on "mutation" so only mutations measured in both selections
    # are plotted; one_to_one validates mutations are unique per selection
    corr_df = (
        filtered_effects.query("selection == @sel1")[["mutation", "functional_effect"]]
        .rename(columns={"functional_effect": sel1})
        .merge(
            filtered_effects.query("selection == @sel2")[
                ["mutation", "functional_effect"]
            ].rename(columns={"functional_effect": sel2}),
            validate="one_to_one",
        )
    )
    n = len(corr_df)
    # Pearson correlation between the two selections' functional effects
    r = corr_df[[sel1, sel2]].corr().values[1, 0]
    corr_panels.append(
        alt.Chart(corr_df)
        .encode(
            alt.X(sel1, scale=alt.Scale(nice=False, padding=4)),
            alt.Y(sel2, scale=alt.Scale(nice=False, padding=4)),
            tooltip=[
                "mutation",
                alt.Tooltip(sel1, format=".3f"),
                alt.Tooltip(sel2, format=".3f"),
            ],
        )
        .mark_circle(color="black", size=30, opacity=0.25)
        .properties(
            width=160,
            height=160,
            title=alt.TitleParams(
                f"R = {r:.2f}, N = {n}", fontSize=11, fontWeight="normal", dy=2
            ),
        )
    )

# arrange panels into rows of up to `ncols` charts each; slicing directly
# avoids the original's slice-then-measure length arithmetic
ncols = 4
corr_rows = [
    alt.hconcat(*corr_panels[i : i + ncols])
    for i in range(0, len(corr_panels), ncols)
]
alt.vconcat(*corr_rows).configure_axis(grid=False)
Correlation scatter plot for times_seen filter of 2
Out[6]: