Average mutation functional effects for a condition¶
Import Python modules.
We use polyclonal for the plotting:
In [1]:
import itertools
import math
import altair as alt
import dms_variants.utils
import pandas as pd
import polyclonal
import polyclonal.plot
This notebook is parameterized by papermill.
The next cell is tagged as parameters to get the passed parameters.
In [2]:
# this cell is tagged parameters for `papermill` parameterization
site_numbering_map_csv = None
mutation_annotations_csv = None
func_effects_csv = None
latent_effects_csv = None
latent_html = None
functional_html = None
params = None
In [3]:
# Parameters
params = {
"avg_method": "median",
"per_selection_tooltips": True,
"plot_kwargs": {
"addtl_slider_stats": {"times_seen": 3},
"heatmap_max_at_least": 1,
"heatmap_min_at_least": -1,
"init_floor_at_zero": False,
"init_site_statistic": "mean",
"site_zoom_bar_color_col": "region",
"slider_binding_range_kwargs": {
"times_seen": {"step": 1, "min": 1, "max": 25},
"n_selections": {"step": 1},
},
"sites_to_show": {"include_range": [30, 702]},
},
"legend": "Interactive plot of the effects of mutations. Negative values indicated deleterious mutations,\npositive values indicate beneficial mutations for the measured phenotype.\n\nUse the site zoom bar at the top to zoom in on specific sites. The line plot shows a summary\nstatistic indicating the effects of mutations at each site. The heat map shows the effects of\nindividual mutations, with parental amino-acid identities indicated by x and gray\nindicating non-measured mutations.\n \nYou can mouse over points to get details about individual measurements, include measurements\nin individual selection experiments.\n\nThe options at the bottom of the plot let you modify the display, such as by selecting how\nmany different variants a mutation must be seen in to be shown (*minimum times_seen*),\nhow many different experimental selections the mutation was measured in\n(*minimum n_selections*), what site summary statistic to show, etc.\n\nThe *minimum max of effect* at site is useful to select the sites where mutations have\nthe most positive functional effects.\n",
"title": "Mutation effects on entry into TZM-bl cells",
"selections": [
"A-220720-1-1",
"A-220720-2-2",
"A-220804-1-1",
"A-220804-2-2",
"A-220927-1-1",
"A-221017-1-1",
"B-220804-1-1",
"B-220804-2-2",
"B-220912-1-1",
"B-230413-1-1",
],
}
mutation_annotations_csv = None
site_numbering_map_csv = "data/site_numbering_map.csv"
func_effects_csv = "results/func_effects/averages/TZM-bl_entry_func_effects.csv"
latent_effects_csv = "results/func_effects/averages/TZM-bl_entry_latent_effects.csv"
functional_html = "results/func_effects/averages/TZM-bl_entry_func_effects.html"
latent_html = "results/func_effects/averages/TZM-bl_entry_latent_effects.html"
Read the input data:
In [4]:
site_numbering_map = pd.read_csv(site_numbering_map_csv).rename(
columns={"reference_site": "site"}
)
assert site_numbering_map[["site", "sequential_site"]].notnull().all().all()
addtl_site_cols = [
c for c in site_numbering_map.columns if c != "site" and c.endswith("site")
]
if mutation_annotations_csv:
mutation_annotations = pd.read_csv(mutation_annotations_csv)
selections = params["selections"]
func_effects = pd.concat(
[
pd.read_csv(f"results/func_effects/by_selection/{s}_func_effects.csv").assign(
selection=s
)
for s in selections
],
ignore_index=True,
).assign(times_seen=lambda x: x["times_seen"].astype("Int64"))
Correlations among selections¶
Compute the correlations in the mutation effects in terms of functional scores and latent-phenotype effects across selections:
In [5]:
# We compute for several times seen values, get those:
try:
init_times_seen = params["plot_kwargs"]["addtl_slider_stats"]["times_seen"]
except KeyError:
print("No times seen in params, using a value of 3")
init_times_seen = 3
func_effects_tidy = func_effects.assign(
mutation=lambda x: x["wildtype"] + x["site"].astype(str) + x["mutant"]
).melt(
id_vars=["selection", "mutation", "times_seen"],
value_vars=["latent_phenotype_effect", "functional_effect"],
var_name="phenotype",
value_name="effect",
)
# do analysis for each "times_seen"
func_effects_tidy = pd.concat(
[
func_effects_tidy.query("times_seen >= @t").assign(min_times_seen=t)
for t in [1, init_times_seen, 2 * init_times_seen]
]
)
corrs = (
dms_variants.utils.tidy_to_corr(
df=func_effects_tidy,
sample_col="selection",
label_col="mutation",
value_col="effect",
group_cols=["phenotype", "min_times_seen"],
)
.assign(r2=lambda x: x["correlation"] ** 2)
.drop(columns="correlation")
.assign(
min_times_seen=lambda x: "min times seen " + x["min_times_seen"].astype(str)
)
)
for phenotype, phenotype_corr in corrs.groupby("phenotype"):
corr_chart = (
alt.Chart(phenotype_corr)
.encode(
alt.X("selection_1", title=None),
alt.Y("selection_2", title=None),
column=alt.Column("min_times_seen", title=None),
color=alt.Color("r2", scale=alt.Scale(zero=True)),
tooltip=[
alt.Tooltip(c, format=".3g") if c == "r2" else c
for c in ["phenotype", "selection_1", "selection_2", "r2"]
],
)
.mark_rect(stroke="black")
.properties(width=alt.Step(15), height=alt.Step(15), title=phenotype)
.configure_axis(labelLimit=500)
)
display(corr_chart)
Make a scatter plot:
In [6]:
print(f"Correlation scatter plot for times_seen filter of {init_times_seen}")
corr_panels = []
for sel1, sel2 in itertools.combinations(sorted(func_effects["selection"].unique()), 2):
df = func_effects.assign(
mutation=lambda x: x["wildtype"] + x["site"].astype(str) + x["mutant"]
).query("times_seen >= @init_times_seen")
corr_df = (
df.query("selection == @sel1")[["mutation", "functional_effect"]]
.rename(columns={"functional_effect": sel1})
.merge(
df.query("selection == @sel2")[["mutation", "functional_effect"]].rename(
columns={"functional_effect": sel2}
),
validate="one_to_one",
)
)
n = len(corr_df)
r = corr_df[[sel1, sel2]].corr().values[1, 0]
corr_panels.append(
alt.Chart(corr_df)
.encode(
alt.X(sel1, scale=alt.Scale(nice=False, padding=4)),
alt.Y(sel2, scale=alt.Scale(nice=False, padding=4)),
tooltip=[
"mutation",
alt.Tooltip(sel1, format=".3f"),
alt.Tooltip(sel2, format=".3f"),
],
)
.mark_circle(color="black", size=30, opacity=0.25)
.properties(
width=160,
height=160,
title=alt.TitleParams(
f"R = {r:.2f}, N = {n}", fontSize=11, fontWeight="normal", dy=2
),
)
)
ncols = 4
corr_rows = []
for irow in range(int(math.ceil(len(corr_panels) / ncols))):
corr_rows.append(
alt.hconcat(
*[
corr_panels[irow * ncols + icol]
for icol in range(min(ncols, len(corr_panels[irow * ncols :])))
]
)
)
alt.vconcat(*corr_rows).configure_axis(grid=False)
Correlation scatter plot for times_seen filter of 3
Out[6]: