Analyze the functional scores¶

Analyze the functional scores from the functional selections.

First, import Python modules:

In [1]:
import altair as alt

import dms_variants.codonvarianttable
import dms_variants.utils

import numpy

import scipy.stats

import pandas as pd
In [2]:
# allow more rows for Altair (lift the default 5000-row data limit so the
# full count-summary / functional-score tables can be plotted)
_ = alt.data_transformers.disable_max_rows()

The next cell is tagged as parameters for papermill parameterization:

In [3]:
# papermill "parameters" cell: placeholder that is overwritten by the
# injected "# Parameters" cell below when the notebook is parameterized
selections = None
In [4]:
# Parameters
# NOTE(review): "min_pre_selection_frac" and "tol" are injected as strings
# ("2e-06", "1e-06") rather than floats -- this looks like a YAML
# scientific-notation parsing quirk in the upstream config; confirm the
# downstream code coerces them to float. Values are preserved as-is here.
#
# Every selection uses identical scoring / epistasis settings, so build the
# dict from the per-selection sample names plus shared parameter factories
# (factories return fresh dicts so entries do not share mutable state).


def _func_score_params():
    """Fresh copy of the functional-score parameters shared by every selection."""
    return {
        "pseudocount": 0.5,
        "min_wt_count": 10000,
        "min_wt_frac": 0.001,
        "min_pre_selection_count": 20,
        "min_pre_selection_frac": "2e-06",
    }


def _global_epistasis_params():
    """Fresh copy of the global-epistasis fit parameters shared by every selection."""
    return {
        "clip_lower": -6,
        "clip_upper": None,
        "collapse_identical_variants": False,
        "latent_effects_regularization": 0,
        "fit_kwargs": {"maxiter": 1000, "tol": "1e-06"},
    }


# (selection name, pre-selection sample, post-selection sample); note the
# sample names are not always derivable from the selection name (e.g. the
# 241216 CMAH/SA23/SA26 IGM selections and the 250504 selections), so all
# three are listed explicitly.
_selection_samples = [
    ("Lib1-230613-293T", "Lib1-230613-VSVG-control", "Lib1-230613-no-antibody-control"),
    ("Lib2-230613-293T", "Lib2-230613-VSVG-control", "Lib2-230613-no-antibody-control"),
    ("Lib1-240125-293-SA23", "Lib1-240125-293-SA23-VSVG-control", "Lib1-240125-293-SA23-H5-control"),
    ("Lib2-240125-293-SA23", "Lib2-240125-293-SA23-VSVG-control", "Lib2-240125-293-SA23-H5-control"),
    ("Lib1-240125-293-SA26", "Lib1-240125-293-SA26-VSVG-control", "Lib1-240125-293-SA26-H5-control"),
    ("Lib2-240125-293-SA26", "Lib2-240125-293-SA26-VSVG-control", "Lib2-240125-293-SA26-H5-control"),
    ("Lib1-240704-293", "Lib1-240704-293-VSVG-control", "Lib1-240704-293-H5-control"),
    ("Lib2-240704-293", "Lib2-240704-293-VSVG-control", "Lib2-240704-293-H5-control"),
    ("Lib1-240704-CMAH-293", "Lib1-240704-CMAH-293-VSVG-control", "Lib1-240704-CMAH-293-H5-control"),
    ("Lib2-240704-CMAH-293", "Lib2-240704-CMAH-293-VSVG-control", "Lib2-240704-CMAH-293-H5-control"),
    ("Lib1-241216-293-IGM", "Lib1-241216-293-IGM-VSVG-control", "Lib1-241216-293-IGM-H5-control"),
    ("Lib2-241216-293-IGM", "Lib2-241216-293-IGM-VSVG-control", "Lib2-241216-293-IGM-H5-control"),
    ("Lib1-241216-CMAH-IGM", "Lib1-241216-CMAH-293-IGM-VSVG-control", "Lib1-241216-CMAH-293-IGM-H5-control"),
    ("Lib2-241216-CMAH-IGM", "Lib2-241216-CMAH-293-IGM-VSVG-control", "Lib2-241216-CMAH-293-IGM-H5-control"),
    ("Lib1-241216-SA23-IGM", "Lib1-241216-293-SA23-IGM-VSVG-control", "Lib1-241216-293-SA23-IGM-H5-control"),
    ("Lib2-241216-SA23-IGM", "Lib2-241216-293-SA23-IGM-VSVG-control", "Lib2-241216-293-SA23-IGM-H5-control"),
    ("Lib1-241216-SA26-IGM", "Lib1-241216-293-SA26-IGM-VSVG-control", "Lib1-241216-293-SA26-IGM-H5-control"),
    ("Lib2-241216-SA26-IGM", "Lib2-241216-293-SA26-IGM-VSVG-control", "Lib2-241216-293-SA26-IGM-H5-control"),
    ("Lib1-250504-SA26-SA23-IGM", "Lib1-250504-H5_VSVG", "Lib1-250504-H5-no_mAb"),
    ("Lib2-250504-SA26-SA23-IGM", "Lib2-250504-H5_VSVG", "Lib2-250504-H5-no_mAb"),
]

selections = {
    name: {
        "pre_selection_sample": pre_sample,
        "post_selection_sample": post_sample,
        "func_score_params": _func_score_params(),
        "global_epistasis_params": _global_epistasis_params(),
    }
    for name, pre_sample, post_sample in _selection_samples
}

Read in all the data:

In [5]:
# Load the per-selection outputs of the functional-scores pipeline step and
# concatenate them across all selections.
count_summary_frames = [
    pd.read_csv(f"results/func_scores/{selection}_count_summary.csv")
    for selection in selections
]
count_summaries = pd.concat(count_summary_frames, ignore_index=True)

# functional scores get an explicit "selection" column identifying their source
func_score_frames = [
    pd.read_csv(f"results/func_scores/{selection}_func_scores.csv").assign(
        selection=selection
    )
    for selection in selections
]
func_scores = pd.concat(func_score_frames, ignore_index=True)

Pre- and post-selection counts for variants¶

Plot the pre-selection and post-selection sample count distributions for the variants.

The boxes span the first to third quartiles, the black line is the median, and the thin horizontal line spans the full range.

For the pre-selection counts, the red line marks the minimum pre-selection count threshold: variants with fewer pre-selection counts than this are excluded from the set of functional scores that are calculated.

Note that the x-axis uses a symlog scale.

You can use the options at the bottom to subset to specific samples.

In [6]:
# Build one box-plot-style chart per count type ("pre" and "post" selection),
# one row per selection, then concatenate the two charts horizontally.
charts = []
for count_type in ["pre", "post"]:
    # shared y encoding; only the left ("pre") chart shows selection labels
    base_chart = (
        alt.Chart(count_summaries)
        .encode(
            y=alt.Y("selection", title=None, axis=alt.Axis(labels=count_type == "pre")),
            tooltip=count_summaries.columns.tolist(),
        )
        .properties(
            width=275,
            height=alt.Step(15),
            title=f"{count_type}-selection counts",
        )
    )

    # blue box spanning the first-to-third quartiles, on a symlog x-scale
    quantile_bar = base_chart.encode(
        x=alt.X(
            f"{count_type}_count_q1",
            scale=alt.Scale(type="symlog", constant=20),
            axis=alt.Axis(labelOverlap=True),
            title=f"{count_type}-selection counts for variant",
        ),
        x2=f"{count_type}_count_q3",
    ).mark_bar(color="blue", height={"band": 0.8})

    # thin rule spanning the full min-to-max count range
    range_line = base_chart.encode(
        x=f"{count_type}_count_min",
        x2=f"{count_type}_count_max",
    ).mark_rule(color="blue", opacity=0.5)

    # black tick at the median (x == x2, widened to 2 px via the offsets)
    median_line = base_chart.encode(
        x=f"{count_type}_count_median",
        x2=f"{count_type}_count_median",
    ).mark_bar(xOffset=1, x2Offset=-1, color="black", height={"band": 0.8})

    if count_type == "pre":
        # red tick marking the min_pre_selection_count filtering threshold
        threshold_line = base_chart.encode(
            x="min_pre_selection_count",
            x2="min_pre_selection_count",
        ).mark_bar(xOffset=1, x2Offset=-1, color="red", height={"band": 0.8})
        charts.append(quantile_bar + range_line + median_line + threshold_line)
    else:
        charts.append(quantile_bar + range_line + median_line)

count_summary_chart = alt.hconcat(*charts)

# Dropdown selectors for subsetting by library and selection dates; note these
# are also reused by the ridgeline and correlation charts in later cells.
selectors = [
    alt.selection_point(
        fields=[sel],
        bind=alt.binding_select(
            options=[None] + sorted(count_summaries[sel].unique()),
            labels=["all"] + sorted(count_summaries[sel].unique()),
            name=sel,
        ),
    )
    for sel in ["library", "pre_selection_date", "post_selection_date"]
]

for sel in selectors:
    count_summary_chart = count_summary_chart.add_params(sel).transform_filter(sel)

count_summary_chart = count_summary_chart.properties(
    autosize=alt.AutoSizeParams(resize=True),
)

count_summary_chart
Out[6]:

Distributions of functional scores¶

Plot the distribution of functional scores among retained variants (those with adequate pre-selection counts).

These are plotted as ridgeplots.

In [7]:
# add a "variant_class" column classifying each variant (wildtype,
# synonymous, nonsynonymous, deletion, stop)
classify_variants = dms_variants.codonvarianttable.CodonVariantTable.classifyVariants
func_scores = classify_variants(func_scores)


def ridgeplot(df):
    """Ridgeline plot of functional-score distributions.

    One facet per selection; within each facet, one smoothed (Gaussian KDE)
    distribution per variant class, filled by that class's mean functional
    score. Reads the module-level ``count_summaries`` (for library / date
    annotations) and ``selectors`` (shared dropdown filters).
    """
    # variant classes present in the data, in fixed bottom-to-top plot order
    class_order = [
        "wildtype",
        "synonymous",
        "1 nonsynonymous",
        ">1 nonsynonymous",
        "deletion",
        "stop",
    ]
    present_classes = set(df["variant_class"])
    variant_classes = [c for c in reversed(class_order) if c in present_classes]

    # every class in the data must be one we know how to order
    assert set(df["variant_class"]) == set(variant_classes)

    # common evaluation grid spanning the full functional-score range
    bins = numpy.linspace(
        df["func_score"].min(),
        df["func_score"].max(),
        num=50,
    )

    # smoothed score distribution per (selection, variant class) group
    kde_frames = []
    for (selection, variant_class), class_df in df.groupby(
        ["selection", "variant_class"]
    ):
        kde_frames.append(
            pd.DataFrame(
                {
                    "selection": selection,
                    "variant_class": variant_class,
                    "func_score": bins,
                    "count": scipy.stats.gaussian_kde(class_df["func_score"])(bins),
                    "mean_func_score": class_df["func_score"].mean(),
                    "number of variants": len(class_df),
                }
            )
        )
    smoothed_dist = pd.concat(kde_frames).merge(
        count_summaries[
            ["selection", "library", "pre_selection_date", "post_selection_date"]
        ],
        on="selection",
        validate="many_to_one",
    )

    # baseline (y) and curve top (y2) for each class's ridgeline
    facet_overlap = 0.7  # maximal facet overlap
    max_count = smoothed_dist["count"].max()
    class_index = {c: i for i, c in enumerate(variant_classes)}
    smoothed_dist = smoothed_dist.assign(
        y=smoothed_dist["variant_class"].map(class_index),
        y2=lambda x: x["y"] + x["count"] / max_count / facet_overlap,
    )

    # ridgeline plot, based on the Altair gallery example but using y / y2
    # rather than row facets:
    # https://altair-viz.github.io/gallery/ridgeline_plot.html
    chart = (
        alt.Chart(smoothed_dist)
        .encode(
            x=alt.X(
                "func_score", title="functional score", scale=alt.Scale(nice=False)
            ),
            y=alt.Y(
                "y",
                scale=alt.Scale(nice=False),
                title=None,
                axis=alt.Axis(
                    ticks=False,
                    domain=False,
                    # set manual labels https://stackoverflow.com/a/64106056
                    values=[v + 0.5 for v in range(len(variant_classes))],
                    labelExpr=f"{str(variant_classes)}[round(datum.value - 0.5)]",
                ),
            ),
            y2=alt.Y2("y2"),
            fill=alt.Fill(
                "mean_func_score:Q",
                title="mean functional score",
                legend=alt.Legend(direction="horizontal"),
                scale=alt.Scale(scheme="yellowgreenblue"),
            ),
            facet=alt.Facet(
                "selection",
                columns=4,
                title=None,
                header=alt.Header(
                    labelFontWeight="bold",
                    labelPadding=0,
                ),
            ),
            tooltip=[
                "selection",
                "variant_class",
                alt.Tooltip(
                    "mean_func_score", format=".2f", title="mean functional score"
                ),
                "number of variants",
                "library",
                "pre_selection_date",
                "post_selection_date",
            ],
        )
        .mark_area(
            interpolate="monotone",
            smooth=True,
            fillOpacity=0.8,
            stroke="lightgray",
            strokeWidth=0.5,
        )
        .configure_view(stroke=None)
        .configure_axis(grid=False)
        .properties(width=180, height=22 * len(variant_classes))
    )

    # wire up the shared library / date dropdown filters
    for selector in selectors:
        chart = chart.add_params(selector).transform_filter(selector)

    return chart.properties(autosize=alt.AutoSizeParams(resize=True))


# render ridgeline plots of the functional-score distributions
ridgeplot(func_scores)
Out[7]:

Correlations in variant functional scores¶

Correlations among functional scores of variants with the same barcode in the same library:

In [8]:
# Look up each selection's pre-/post-selection dates so correlation pairs
# can be annotated with the date the two selections share (if any).
pre_selection_dates = count_summaries.set_index("selection")[
    "pre_selection_date"
].to_dict()
post_selection_dates = count_summaries.set_index("selection")[
    "post_selection_date"
].to_dict()

# pairwise correlations of per-barcode functional scores within each library
corrs = dms_variants.utils.tidy_to_corr(
    df=func_scores.merge(
        count_summaries[["selection", "library"]],
        on="selection",
        validate="many_to_one",
    ),
    sample_col="selection",
    label_col="barcode",
    value_col="func_score",
    group_cols="library",
).rename(columns={"correlation": "r"})

corrs["r2"] = corrs["r"] ** 2

# For each pair of selections, record the shared pre-/post-selection date,
# or NA when the two selections in the pair have different dates.
for date_col, date_map in [
    ("pre_selection_date", pre_selection_dates),
    ("post_selection_date", post_selection_dates),
]:
    dates_1 = corrs["selection_1"].map(date_map)
    dates_2 = corrs["selection_2"].map(date_map)
    corrs[date_col] = dates_1.where(dates_1 == dates_2, pd.NA)

# one correlation heatmap per library, filterable by the shared dropdowns
for library, library_corr in corrs.groupby("library"):
    heatmap = (
        alt.Chart(library_corr)
        .encode(
            alt.X("selection_1", title=None),
            alt.Y("selection_2", title=None),
            color=alt.Color("r2", scale=alt.Scale(zero=True)),
            tooltip=[
                alt.Tooltip(col, format=".3g") if col in ["r", "r2"] else col
                for col in corrs.columns
            ],
        )
        .mark_rect(stroke="black")
        .properties(width=alt.Step(15), height=alt.Step(15), title=library)
        .configure_axis(labelLimit=500)
    )

    for selector in selectors:
        heatmap = heatmap.add_params(selector).transform_filter(selector)

    display(heatmap.properties(autosize=alt.AutoSizeParams(resize=True)))
In [ ]: