Analyze the functional scores¶
Analyze the functional scores from the functional selections.
First, import Python modules:
import altair as alt
import dms_variants.codonvarianttable
import dms_variants.utils
import numpy
import scipy.stats
import pandas as pd
# allow more rows for Altair
_ = alt.data_transformers.disable_max_rows()
The next cell is tagged as parameters for papermill parameterization:
selections = None
# Parameters
selections = {
"293T-Mxra8-E3E2-B-241028-func-1": {
"post_selection_sample": "E3E2_B-241028-293T_Mxra8_no_antibody-1",
"pre_selection_sample": "E3E2_B-241028-293T_Mxra8_VSVG-1",
"func_score_params": {
"pseudocount": 0.5,
"min_wt_count": 1000,
"min_wt_frac": 0.001,
"min_pre_selection_count": 10,
"min_pre_selection_frac": "1e-06",
},
"global_epistasis_params": {
"clip_lower": -8,
"clip_upper": None,
"collapse_identical_variants": False,
"latent_effects_regularization": "1e-07",
},
},
"293T-Mxra8-E3E2-B-241028-func-2": {
"post_selection_sample": "E3E2_B-241028-293T_Mxra8_no_antibody-2",
"pre_selection_sample": "E3E2_B-241028-293T_Mxra8_VSVG-2",
"func_score_params": {
"pseudocount": 0.5,
"min_wt_count": 1000,
"min_wt_frac": 0.001,
"min_pre_selection_count": 10,
"min_pre_selection_frac": "1e-06",
},
"global_epistasis_params": {
"clip_lower": -8,
"clip_upper": None,
"collapse_identical_variants": False,
"latent_effects_regularization": "1e-07",
},
},
"293T-TIM1-E3E2-B-241028-func-1": {
"post_selection_sample": "E3E2_B-241028-293T_TIM1_no_antibody-1",
"pre_selection_sample": "E3E2_B-241028-293T_TIM1_VSVG-1",
"func_score_params": {
"pseudocount": 0.5,
"min_wt_count": 1000,
"min_wt_frac": 0.001,
"min_pre_selection_count": 10,
"min_pre_selection_frac": "1e-06",
},
"global_epistasis_params": {
"clip_lower": -8,
"clip_upper": None,
"collapse_identical_variants": False,
"latent_effects_regularization": "1e-07",
},
},
"293T-TIM1-E3E2-B-241028-func-2": {
"post_selection_sample": "E3E2_B-241028-293T_TIM1_no_antibody-2",
"pre_selection_sample": "E3E2_B-241028-293T_TIM1_VSVG-2",
"func_score_params": {
"pseudocount": 0.5,
"min_wt_count": 1000,
"min_wt_frac": 0.001,
"min_pre_selection_count": 10,
"min_pre_selection_frac": "1e-06",
},
"global_epistasis_params": {
"clip_lower": -8,
"clip_upper": None,
"collapse_identical_variants": False,
"latent_effects_regularization": "1e-07",
},
},
"293T-Mxra8-6KE1-B-241028-func-1": {
"post_selection_sample": "6KE1_B-241028-293T_Mxra8_no_antibody-1",
"pre_selection_sample": "6KE1_B-241028-293T_Mxra8_VSVG-1",
"func_score_params": {
"pseudocount": 0.5,
"min_wt_count": 1000,
"min_wt_frac": 0.001,
"min_pre_selection_count": 10,
"min_pre_selection_frac": "1e-06",
},
"global_epistasis_params": {
"clip_lower": -8,
"clip_upper": None,
"collapse_identical_variants": False,
"latent_effects_regularization": "1e-07",
},
},
"293T-Mxra8-6KE1-B-241028-func-2": {
"post_selection_sample": "6KE1_B-241028-293T_Mxra8_no_antibody-2",
"pre_selection_sample": "6KE1_B-241028-293T_Mxra8_VSVG-2",
"func_score_params": {
"pseudocount": 0.5,
"min_wt_count": 1000,
"min_wt_frac": 0.001,
"min_pre_selection_count": 10,
"min_pre_selection_frac": "1e-06",
},
"global_epistasis_params": {
"clip_lower": -8,
"clip_upper": None,
"collapse_identical_variants": False,
"latent_effects_regularization": "1e-07",
},
},
"293T-TIM1-6KE1-B-241028-func-1": {
"post_selection_sample": "6KE1_B-241028-293T_TIM1_no_antibody-1",
"pre_selection_sample": "6KE1_B-241028-293T_TIM1_VSVG-1",
"func_score_params": {
"pseudocount": 0.5,
"min_wt_count": 1000,
"min_wt_frac": 0.001,
"min_pre_selection_count": 10,
"min_pre_selection_frac": "1e-06",
},
"global_epistasis_params": {
"clip_lower": -8,
"clip_upper": None,
"collapse_identical_variants": False,
"latent_effects_regularization": "1e-07",
},
},
"293T-TIM1-6KE1-B-241028-func-2": {
"post_selection_sample": "6KE1_B-241028-293T_TIM1_no_antibody-2",
"pre_selection_sample": "6KE1_B-241028-293T_TIM1_VSVG-2",
"func_score_params": {
"pseudocount": 0.5,
"min_wt_count": 1000,
"min_wt_frac": 0.001,
"min_pre_selection_count": 10,
"min_pre_selection_frac": "1e-06",
},
"global_epistasis_params": {
"clip_lower": -8,
"clip_upper": None,
"collapse_identical_variants": False,
"latent_effects_regularization": "1e-07",
},
},
"293T-Mxra8-E3E2-A-241113-func-1": {
"post_selection_sample": "E3E2_A-241113-293T_Mxra8_no_antibody-1",
"pre_selection_sample": "E3E2_A-241113-293T_Mxra8_VSVG-1",
"func_score_params": {
"pseudocount": 0.5,
"min_wt_count": 1000,
"min_wt_frac": 0.001,
"min_pre_selection_count": 10,
"min_pre_selection_frac": "1e-06",
},
"global_epistasis_params": {
"clip_lower": -8,
"clip_upper": None,
"collapse_identical_variants": False,
"latent_effects_regularization": "1e-07",
},
},
"293T-Mxra8-E3E2-A-241113-func-2": {
"post_selection_sample": "E3E2_A-241113-293T_Mxra8_no_antibody-2",
"pre_selection_sample": "E3E2_A-241113-293T_Mxra8_VSVG-2",
"func_score_params": {
"pseudocount": 0.5,
"min_wt_count": 1000,
"min_wt_frac": 0.001,
"min_pre_selection_count": 10,
"min_pre_selection_frac": "1e-06",
},
"global_epistasis_params": {
"clip_lower": -8,
"clip_upper": None,
"collapse_identical_variants": False,
"latent_effects_regularization": "1e-07",
},
},
"293T-TIM1-E3E2-A-241113-func-1": {
"post_selection_sample": "E3E2_A-241113-293T_TIM1_no_antibody-1",
"pre_selection_sample": "E3E2_A-241113-293T_TIM1_VSVG-1",
"func_score_params": {
"pseudocount": 0.5,
"min_wt_count": 1000,
"min_wt_frac": 0.001,
"min_pre_selection_count": 10,
"min_pre_selection_frac": "1e-06",
},
"global_epistasis_params": {
"clip_lower": -8,
"clip_upper": None,
"collapse_identical_variants": False,
"latent_effects_regularization": "1e-07",
},
},
"293T-TIM1-E3E2-A-241113-func-2": {
"post_selection_sample": "E3E2_A-241113-293T_TIM1_no_antibody-2",
"pre_selection_sample": "E3E2_A-241113-293T_TIM1_VSVG-2",
"func_score_params": {
"pseudocount": 0.5,
"min_wt_count": 1000,
"min_wt_frac": 0.001,
"min_pre_selection_count": 10,
"min_pre_selection_frac": "1e-06",
},
"global_epistasis_params": {
"clip_lower": -8,
"clip_upper": None,
"collapse_identical_variants": False,
"latent_effects_regularization": "1e-07",
},
},
"293T-Mxra8-6KE1-A-241113-func-1": {
"post_selection_sample": "6KE1_A-241113-293T_Mxra8_no_antibody-1",
"pre_selection_sample": "6KE1_A-241113-293T_Mxra8_VSVG-1",
"func_score_params": {
"pseudocount": 0.5,
"min_wt_count": 1000,
"min_wt_frac": 0.001,
"min_pre_selection_count": 10,
"min_pre_selection_frac": "1e-06",
},
"global_epistasis_params": {
"clip_lower": -8,
"clip_upper": None,
"collapse_identical_variants": False,
"latent_effects_regularization": "1e-07",
},
},
"293T-Mxra8-6KE1-A-241113-func-2": {
"post_selection_sample": "6KE1_A-241113-293T_Mxra8_no_antibody-2",
"pre_selection_sample": "6KE1_A-241113-293T_Mxra8_VSVG-2",
"func_score_params": {
"pseudocount": 0.5,
"min_wt_count": 1000,
"min_wt_frac": 0.001,
"min_pre_selection_count": 10,
"min_pre_selection_frac": "1e-06",
},
"global_epistasis_params": {
"clip_lower": -8,
"clip_upper": None,
"collapse_identical_variants": False,
"latent_effects_regularization": "1e-07",
},
},
"293T-TIM1-6KE1-A-241113-func-1": {
"post_selection_sample": "6KE1_A-241113-293T_TIM1_no_antibody-1",
"pre_selection_sample": "6KE1_A-241113-293T_TIM1_VSVG-1",
"func_score_params": {
"pseudocount": 0.5,
"min_wt_count": 1000,
"min_wt_frac": 0.001,
"min_pre_selection_count": 10,
"min_pre_selection_frac": "1e-06",
},
"global_epistasis_params": {
"clip_lower": -8,
"clip_upper": None,
"collapse_identical_variants": False,
"latent_effects_regularization": "1e-07",
},
},
"293T-TIM1-6KE1-A-241113-func-2": {
"post_selection_sample": "6KE1_A-241113-293T_TIM1_no_antibody-2",
"pre_selection_sample": "6KE1_A-241113-293T_TIM1_VSVG-2",
"func_score_params": {
"pseudocount": 0.5,
"min_wt_count": 1000,
"min_wt_frac": 0.001,
"min_pre_selection_count": 10,
"min_pre_selection_frac": "1e-06",
},
"global_epistasis_params": {
"clip_lower": -8,
"clip_upper": None,
"collapse_identical_variants": False,
"latent_effects_regularization": "1e-07",
},
},
"C636-E3E2-B-241122-func-1": {
"post_selection_sample": "E3E2_B-241122-C636-1",
"pre_selection_sample": "E3E2_B-241122-C636_VSVG-1",
"func_score_params": {
"pseudocount": 0.5,
"min_wt_count": 1000,
"min_wt_frac": 0.001,
"min_pre_selection_count": 10,
"min_pre_selection_frac": "1e-06",
},
"global_epistasis_params": {
"clip_lower": -8,
"clip_upper": None,
"collapse_identical_variants": False,
"latent_effects_regularization": "1e-07",
},
},
"C636-E3E2-B-241122-func-2": {
"post_selection_sample": "E3E2_B-241122-C636-2",
"pre_selection_sample": "E3E2_B-241122-C636_VSVG-2",
"func_score_params": {
"pseudocount": 0.5,
"min_wt_count": 1000,
"min_wt_frac": 0.001,
"min_pre_selection_count": 10,
"min_pre_selection_frac": "1e-06",
},
"global_epistasis_params": {
"clip_lower": -8,
"clip_upper": None,
"collapse_identical_variants": False,
"latent_effects_regularization": "1e-07",
},
},
"C636-6KE1-B-241122-func-1": {
"post_selection_sample": "6KE1_B-241122-C636-1",
"pre_selection_sample": "6KE1_B-241122-C636_VSVG-1",
"func_score_params": {
"pseudocount": 0.5,
"min_wt_count": 1000,
"min_wt_frac": 0.001,
"min_pre_selection_count": 10,
"min_pre_selection_frac": "1e-06",
},
"global_epistasis_params": {
"clip_lower": -8,
"clip_upper": None,
"collapse_identical_variants": False,
"latent_effects_regularization": "1e-07",
},
},
"C636-6KE1-B-241122-func-2": {
"post_selection_sample": "6KE1_B-241122-C636-2",
"pre_selection_sample": "6KE1_B-241122-C636_VSVG-2",
"func_score_params": {
"pseudocount": 0.5,
"min_wt_count": 1000,
"min_wt_frac": 0.001,
"min_pre_selection_count": 10,
"min_pre_selection_frac": "1e-06",
},
"global_epistasis_params": {
"clip_lower": -8,
"clip_upper": None,
"collapse_identical_variants": False,
"latent_effects_regularization": "1e-07",
},
},
"C636-E3E2-A-241212-func-1": {
"post_selection_sample": "E3E2_A-241212-C636-1",
"pre_selection_sample": "E3E2_A-241212-C636_VSVG-1",
"func_score_params": {
"pseudocount": 0.5,
"min_wt_count": 1000,
"min_wt_frac": 0.001,
"min_pre_selection_count": 10,
"min_pre_selection_frac": "1e-06",
},
"global_epistasis_params": {
"clip_lower": -8,
"clip_upper": None,
"collapse_identical_variants": False,
"latent_effects_regularization": "1e-07",
},
},
"C636-E3E2-A-241212-func-2": {
"post_selection_sample": "E3E2_A-241212-C636-2",
"pre_selection_sample": "E3E2_A-241212-C636_VSVG-2",
"func_score_params": {
"pseudocount": 0.5,
"min_wt_count": 1000,
"min_wt_frac": 0.001,
"min_pre_selection_count": 10,
"min_pre_selection_frac": "1e-06",
},
"global_epistasis_params": {
"clip_lower": -8,
"clip_upper": None,
"collapse_identical_variants": False,
"latent_effects_regularization": "1e-07",
},
},
"C636-6KE1-A-241212-func-1": {
"post_selection_sample": "6KE1_A-241212-C636-1",
"pre_selection_sample": "6KE1_A-241212-C636_VSVG-1",
"func_score_params": {
"pseudocount": 0.5,
"min_wt_count": 1000,
"min_wt_frac": 0.001,
"min_pre_selection_count": 10,
"min_pre_selection_frac": "1e-06",
},
"global_epistasis_params": {
"clip_lower": -8,
"clip_upper": None,
"collapse_identical_variants": False,
"latent_effects_regularization": "1e-07",
},
},
"C636-6KE1-A-241212-func-2": {
"post_selection_sample": "6KE1_A-241212-C636-2",
"pre_selection_sample": "6KE1_A-241212-C636_VSVG-2",
"func_score_params": {
"pseudocount": 0.5,
"min_wt_count": 1000,
"min_wt_frac": 0.001,
"min_pre_selection_count": 10,
"min_pre_selection_frac": "1e-06",
},
"global_epistasis_params": {
"clip_lower": -8,
"clip_upper": None,
"collapse_identical_variants": False,
"latent_effects_regularization": "1e-07",
},
},
}
Read in all the data:
count_summaries = pd.concat(
[pd.read_csv(f"results/func_scores/{s}_count_summary.csv") for s in selections],
ignore_index=True,
)
func_scores = pd.concat(
[
pd.read_csv(f"results/func_scores/{s}_func_scores.csv").assign(selection=s)
for s in selections
],
ignore_index=True,
)
Pre- and post-selection counts for variants¶
Plot the pre-selection and post-selection sample count distributions for the variants.
The boxes span the first to third quartiles, the black line is the median, and the thin horizontal line spans the full range.
For the pre-selection counts the red line is the minimum counts: anything below that (with fewer pre-selection counts) is not included in the set of functional scores that are calculated.
Note that the x-axis uses a symlog scale.
You can use the options at the bottom to subset to specific samples.
charts = []
for count_type in ["pre", "post"]:
base_chart = (
alt.Chart(count_summaries)
.encode(
y=alt.Y("selection", title=None, axis=alt.Axis(labels=count_type == "pre")),
tooltip=count_summaries.columns.tolist(),
)
.properties(
width=275,
height=alt.Step(15),
title=f"{count_type}-selection counts",
)
)
quantile_bar = base_chart.encode(
x=alt.X(
f"{count_type}_count_q1",
scale=alt.Scale(type="symlog", constant=20),
axis=alt.Axis(labelOverlap=True),
title=f"{count_type}-selection counts for variant",
),
x2=f"{count_type}_count_q3",
).mark_bar(color="blue", height={"band": 0.8})
range_line = base_chart.encode(
x=f"{count_type}_count_min",
x2=f"{count_type}_count_max",
).mark_rule(color="blue", opacity=0.5)
median_line = base_chart.encode(
x=f"{count_type}_count_median",
x2=f"{count_type}_count_median",
).mark_bar(xOffset=1, x2Offset=-1, color="black", height={"band": 0.8})
if count_type == "pre":
threshold_line = base_chart.encode(
x="min_pre_selection_count",
x2="min_pre_selection_count",
).mark_bar(xOffset=1, x2Offset=-1, color="red", height={"band": 0.8})
charts.append(quantile_bar + range_line + median_line + threshold_line)
else:
charts.append(quantile_bar + range_line + median_line)
count_summary_chart = alt.hconcat(*charts)
selectors = [
alt.selection_point(
fields=[sel],
bind=alt.binding_select(
options=[None] + sorted(count_summaries[sel].unique()),
labels=["all"] + sorted(count_summaries[sel].unique()),
name=sel,
),
)
for sel in ["library", "pre_selection_date", "post_selection_date"]
]
for sel in selectors:
count_summary_chart = count_summary_chart.add_params(sel).transform_filter(sel)
count_summary_chart = count_summary_chart.properties(
autosize=alt.AutoSizeParams(resize=True),
)
count_summary_chart
Distributions of functional scores¶
Plot the functional scores distribution among retained variants (adequate pre-selection counts).
These are plotted as ridgeplots.
# classify variants
func_scores = func_scores.pipe(
dms_variants.codonvarianttable.CodonVariantTable.classifyVariants
)
def ridgeplot(df):
variant_classes = list(
reversed(
[
c
for c in [
"wildtype",
"synonymous",
"1 nonsynonymous",
">1 nonsynonymous",
"deletion",
"stop",
]
if c in set(df["variant_class"])
]
)
)
assert set(df["variant_class"]) == set(variant_classes)
# get smoothed distribution of functional scores
bins = numpy.linspace(
df["func_score"].min(),
df["func_score"].max(),
num=50,
)
smoothed_dist = pd.concat(
[
pd.DataFrame(
{
"selection": sel,
"variant_class": var,
"func_score": bins,
"count": scipy.stats.gaussian_kde(df["func_score"])(bins),
"mean_func_score": df["func_score"].mean(),
"number of variants": len(df),
}
)
for (sel, var), df in df.groupby(["selection", "variant_class"])
]
).merge(
count_summaries[
["selection", "library", "pre_selection_date", "post_selection_date"]
],
on="selection",
validate="many_to_one",
)
# assign y / y2 for plotting
facet_overlap = 0.7 # maximal facet overlap
max_count = (smoothed_dist["count"]).max()
smoothed_dist = smoothed_dist.assign(
y=lambda x: x["variant_class"].map(lambda v: variant_classes.index(v)),
y2=lambda x: x["y"] + x["count"] / max_count / facet_overlap,
)
# ridgeline plot, based on this but using y / y2 rather than row:
# https://altair-viz.github.io/gallery/ridgeline_plot.html
ridgeline_chart = (
alt.Chart(smoothed_dist)
.encode(
x=alt.X(
"func_score", title="functional score", scale=alt.Scale(nice=False)
),
y=alt.Y(
"y",
scale=alt.Scale(nice=False),
title=None,
axis=alt.Axis(
ticks=False,
domain=False,
# set manual labels https://stackoverflow.com/a/64106056
values=[v + 0.5 for v in range(len(variant_classes))],
labelExpr=f"{str(variant_classes)}[round(datum.value - 0.5)]",
),
),
y2=alt.Y2("y2"),
fill=alt.Fill(
"mean_func_score:Q",
title="mean functional score",
legend=alt.Legend(direction="horizontal"),
scale=alt.Scale(scheme="yellowgreenblue"),
),
facet=alt.Facet(
"selection",
columns=4,
title=None,
header=alt.Header(
labelFontWeight="bold",
labelPadding=0,
),
),
tooltip=[
"selection",
"variant_class",
alt.Tooltip(
"mean_func_score", format=".2f", title="mean functional score"
),
"number of variants",
"library",
"pre_selection_date",
"post_selection_date",
],
)
.mark_area(
interpolate="monotone",
smooth=True,
fillOpacity=0.8,
stroke="lightgray",
strokeWidth=0.5,
)
.configure_view(stroke=None)
.configure_axis(grid=False)
.properties(width=180, height=22 * len(variant_classes))
)
for sel in selectors:
ridgeline_chart = ridgeline_chart.add_params(sel).transform_filter(sel)
ridgeline_chart = ridgeline_chart.properties(
autosize=alt.AutoSizeParams(resize=True),
)
return ridgeline_chart
ridgeplot(func_scores)
Correlations in variant functional scores¶
Correlations among functional scores of variants with the same barcode in the same library:
pre_selection_dates = count_summaries.set_index("selection")[
"pre_selection_date"
].to_dict()
post_selection_dates = count_summaries.set_index("selection")[
"post_selection_date"
].to_dict()
corrs = (
dms_variants.utils.tidy_to_corr(
df=func_scores.merge(
count_summaries[["selection", "library"]],
on="selection",
validate="many_to_one",
),
sample_col="selection",
label_col="barcode",
value_col="func_score",
group_cols="library",
)
.rename(columns={"correlation": "r"})
.assign(
r2=lambda x: x["r"] ** 2,
pre_date_1=lambda x: x["selection_1"].map(pre_selection_dates),
pre_date_2=lambda x: x["selection_2"].map(pre_selection_dates),
pre_selection_date=lambda x: x["pre_date_1"].where(
x["pre_date_1"] == x["pre_date_2"], pd.NA
),
post_date_1=lambda x: x["selection_1"].map(post_selection_dates),
post_date_2=lambda x: x["selection_2"].map(post_selection_dates),
post_selection_date=lambda x: x["post_date_1"].where(
x["post_date_1"] == x["post_date_2"], pd.NA
),
)
.drop(columns=["pre_date_1", "pre_date_2", "post_date_1", "post_date_2"])
)
for library, library_corr in corrs.groupby("library"):
corr_chart = (
alt.Chart(library_corr)
.encode(
alt.X("selection_1", title=None),
alt.Y("selection_2", title=None),
color=alt.Color("r2", scale=alt.Scale(zero=True)),
tooltip=[
alt.Tooltip(c, format=".3g") if c in ["r", "r2"] else c
for c in corrs.columns
],
)
.mark_rect(stroke="black")
.properties(width=alt.Step(15), height=alt.Step(15), title=library)
.configure_axis(labelLimit=500)
)
for sel in selectors:
corr_chart = corr_chart.add_params(sel).transform_filter(sel)
corr_chart = corr_chart.properties(autosize=alt.AutoSizeParams(resize=True))
display(corr_chart)