Fit polyclonal model to escape in an assay (e.g., antibody selection)
In the notebook below, "antibody" is used as a synonym for any agent that will neutralize the viral infectivity. However, the plotting is done somewhat differently depending on the assay.
Import Python modules.
import pickle
import altair as alt
import polyclonal
import pandas as pd
This notebook is parameterized by papermill.
The next cell is tagged as parameters to get the passed parameters.
# this cell is tagged parameters for `papermill` parameterization
# Every name is a `None` placeholder here; the concrete values are assigned in the
# "# Parameters" cell that follows.
assay = None  # assay name; echoed when the analysis starts
selection = None  # selection name; used in the chart title
params = None  # nested dict: filters, fitting settings, plot settings, samples
neut_standard_frac_csvs = None  # CSV paths read into `neut_standard_fracs`
prob_escape_csvs = None  # CSV paths read into `prob_escape`
assay_config = None  # assay-specific plot scale / statistic naming settings
prob_escape_mean_csv = None  # output path for the mean prob-escape CSV
site_numbering_map_csv = None  # input CSV with sequential/reference site columns
pickle_file = None  # output path for the pickled fitted model
# Parameters
# Concrete values for this run; they overwrite the `None` placeholders assigned in
# the parameters-tagged cell. (Presumably injected by papermill for each run.)
params = {
    "neut_standard_name": "neut_standard",
    # Thresholds are coerced with float() before use, so numeric strings such as
    # "1e-05" are accepted alongside plain numbers.
    "prob_escape_filters": {
        "min_neut_standard_count": 1000,
        "min_neut_standard_frac": "1e-05",
        "min_no_antibody_count": 20,
        "min_no_antibody_frac": "1e-07",
        "min_antibody_count": 100,
        "min_antibody_frac": "2e-05",
        "max_aa_subs": 3,
        "clip_uncensored_prob_escape": 5,
    },
    "polyclonal_params": {
        "n_epitopes": 1,
        "spatial_distances": "results/spatial_distances/5FYK.csv",
        # passed as keyword arguments to the model's fit() call
        "fit_kwargs": {
            "reg_escape_weight": 0.2,
            "reg_spread_weight": 0.1,
            "reg_activity_weight": 1.0,
            "logfreq": 200,
        },
    },
    # passed (after augmentation in the fitting cell) to the mutation-effect plot
    "escape_plot_kwargs": {
        "addtl_slider_stats": {"times_seen": 2},
        "addtl_tooltip_stats": ["sequential_site"],
        "heatmap_max_at_least": 2,
        "heatmap_min_at_least": -2,
        "init_floor_at_zero": False,
        "init_site_statistic": "sum",
        "site_zoom_bar_color_col": "region",
        "slider_binding_range_kwargs": {"times_seen": {"min": 0, "max": 20, "step": 1}},
        "sites_to_show": {"include_range": [30, 702]},
    },
    # extra per-mutation statistics read from CSV and exposed as plot sliders
    "plot_hide_stats": {
        "functional effect": {
            "csv": "results/func_effects/averages/TZM-bl_entry_func_effects.csv",
            "csv_col": "effect",
            "init": -4,
        }
    },
    "no_antibody_sample": "B-231011-rescue_6_ultra-no_antibody-1",
    # per-sample concentration, and whether the sample is included in the model fit
    "antibody_samples": {
        "B-231011-rescue_6_ultra-3BNC117-2.5-1": {
            "concentration": 2.5,
            "use_in_fit": False,
        },
        "B-231011-rescue_6_ultra-3BNC117-5.0-1": {
            "concentration": 5.0,
            "use_in_fit": True,
        },
        "B-231011-rescue_6_ultra-3BNC117-10.0-1": {
            "concentration": 10.0,
            "use_in_fit": True,
        },
        "B-231011-rescue_6_ultra-3BNC117-20.0-1": {
            "concentration": 20.0,
            "use_in_fit": True,
        },
    },
}
# The two CSV lists below are paired by position with the entries of
# params["antibody_samples"] when they are read, so their order must match.
neut_standard_frac_csvs = [
    "results/antibody_escape/by_selection/B-231011-rescue_6_ultra-3BNC117-1/B-231011-rescue_6_ultra-3BNC117-2.5-1_neut_standard_fracs.csv",
    "results/antibody_escape/by_selection/B-231011-rescue_6_ultra-3BNC117-1/B-231011-rescue_6_ultra-3BNC117-5.0-1_neut_standard_fracs.csv",
    "results/antibody_escape/by_selection/B-231011-rescue_6_ultra-3BNC117-1/B-231011-rescue_6_ultra-3BNC117-10.0-1_neut_standard_fracs.csv",
    "results/antibody_escape/by_selection/B-231011-rescue_6_ultra-3BNC117-1/B-231011-rescue_6_ultra-3BNC117-20.0-1_neut_standard_fracs.csv",
]
prob_escape_csvs = [
    "results/antibody_escape/by_selection/B-231011-rescue_6_ultra-3BNC117-1/B-231011-rescue_6_ultra-3BNC117-2.5-1_prob_escape.csv",
    "results/antibody_escape/by_selection/B-231011-rescue_6_ultra-3BNC117-1/B-231011-rescue_6_ultra-3BNC117-5.0-1_prob_escape.csv",
    "results/antibody_escape/by_selection/B-231011-rescue_6_ultra-3BNC117-1/B-231011-rescue_6_ultra-3BNC117-10.0-1_prob_escape.csv",
    "results/antibody_escape/by_selection/B-231011-rescue_6_ultra-3BNC117-1/B-231011-rescue_6_ultra-3BNC117-20.0-1_prob_escape.csv",
]
# "prob_escape_scale" is unpacked into the y-axis scale of the mean prob-escape plot;
# "scale_stat" and "stat_name" are forwarded to the mutation-effect plot settings.
assay_config = {
    "title": "Antibody/serum escape",
    "selections": "antibody_selections",
    "averages": "avg_antibody_escape",
    "prob_escape_scale": {"type": "symlog", "constant": 0.04},
    "scale_stat": 1,
    "stat_name": "escape",
}
site_numbering_map_csv = "data/site_numbering_map.csv"
prob_escape_mean_csv = "results/antibody_escape/by_selection/B-231011-rescue_6_ultra-3BNC117-1_prob_escape_mean.csv"
pickle_file = "results/antibody_escape/by_selection/B-231011-rescue_6_ultra-3BNC117-1_polyclonal_model.pickle"
assay = "antibody_escape"
selection = "B-231011-rescue_6_ultra-3BNC117-1"
Read and process data
# Echo the assay being analyzed so it appears in the executed notebook's output.
print(f"Analyzing data for {assay=}")
Analyzing data for assay='antibody_escape'
Convert the antibody samples into a data frame:
# One row per antibody sample: the dict keys (sample names) start out as the index
# and are then moved into a regular "sample" column.
antibody_samples = pd.DataFrame.from_dict(params["antibody_samples"], orient="index")
antibody_samples = antibody_samples.reset_index(names="sample")
Get other parameters:
# Coerce every filter threshold to float so that values supplied as numeric
# strings (e.g. "1e-05") can be compared and multiplied like numbers.
prob_escape_filters = {
    filter_name: float(raw_value)
    for filter_name, raw_value in params["prob_escape_filters"].items()
}
Read the neut standard fracs:
# Read the per-sample neutralization-standard fraction CSVs, tag each with its sample
# name, and attach the per-sample metadata columns from `antibody_samples`.
#
# Samples and CSV paths are paired purely by position, and `zip` silently stops at the
# shorter input; the inner merge would then quietly drop the unmatched samples.
# Fail loudly instead if the two inputs differ in length.
if len(neut_standard_frac_csvs) != len(antibody_samples):
    raise ValueError(
        "Need exactly one neut standard frac CSV per antibody sample, but "
        f"{len(neut_standard_frac_csvs)=} and {len(antibody_samples)=}"
    )
neut_standard_fracs = pd.concat(
    [
        pd.read_csv(f).assign(sample=sample)
        for sample, f in zip(antibody_samples["sample"], neut_standard_frac_csvs)
    ],
    ignore_index=True,
).merge(antibody_samples, validate="one_to_one", on="sample")
Read the probabilities (fraction) escape for each variant:
# Read the per-variant probability-of-escape CSVs, tag each with its sample name, and
# attach the per-sample metadata columns from `antibody_samples`.
#
# Samples and CSV paths are paired purely by position, and `zip` silently stops at the
# shorter input; fail loudly instead if the two inputs differ in length.
if len(prob_escape_csvs) != len(antibody_samples):
    raise ValueError(
        "Need exactly one prob escape CSV per antibody sample, but "
        f"{len(prob_escape_csvs)=} and {len(antibody_samples)=}"
    )
prob_escape = pd.concat(
    [
        # keep_default_na=False so that empty strings are read as "" rather than NaN;
        # only the literal "nan" is treated as missing
        pd.read_csv(f, keep_default_na=False, na_values="nan").assign(sample=sample)
        for sample, f in zip(antibody_samples["sample"], prob_escape_csvs)
    ],
    ignore_index=True,
).merge(antibody_samples, validate="many_to_one", on="sample")
Plot the neutralization standard fractions
Plot the neutralization standard fractions for each sample:
# Long-form table: one row per (sample, sample type) holding that sample's
# neutralization standard fraction.
neut_standard_fracs_long = neut_standard_fracs.rename(
    columns={"antibody_frac": "antibody", "no-antibody_frac": "no-antibody"}
).melt(
    id_vars=["sample", "use_in_fit", "concentration"],
    value_vars=["antibody", "no-antibody"],
    var_name="sample type",
    value_name="neutralization standard fraction",
)

# point outline (stroke) and fill use the same palette for each sample type
sample_type_palette = ["#1F77B4FF", "#FF7F0EFF"]

neut_standard_fracs_chart = (
    alt.Chart(neut_standard_fracs_long)
    .mark_point(filled=True, size=50)
    .encode(
        x=alt.X(
            "neutralization standard fraction",
            scale=alt.Scale(type="symlog", constant=0.04, domainMax=1),
        ),
        y=alt.Y("sample", sort=alt.SortField("concentration"), title=None),
        shape=alt.Shape("sample type", title="sample type (filled if used in fit)"),
        stroke=alt.Color("sample type", scale=alt.Scale(range=sample_type_palette)),
        color=alt.Color("sample type", scale=alt.Scale(range=sample_type_palette)),
        # samples not used in the fit get a fully transparent fill (outline only)
        fillOpacity=alt.Opacity(
            "use_in_fit",
            scale=alt.Scale(domain=[True, False], range=[1, 0]),
        ),
        tooltip=[
            "sample",
            alt.Tooltip("concentration", format=".3g"),
            alt.Tooltip("neutralization standard fraction", format=".3g"),
        ],
    )
    .properties(title=f"Neutralization standard fractions for {selection}")
    .configure_axis(labelLimit=500)
)
neut_standard_fracs_chart
Make sure all samples used in the fit have enough neutralization standard counts and fraction:
# Verify that every sample used in the fit has at least the configured minimum
# neutralization-standard count and fraction, in both the antibody and the
# no-antibody selections. Raises ValueError (showing the full table) otherwise.
fit_neut_standard_fracs = neut_standard_fracs.query("use_in_fit")
for prop in ["count", "frac"]:
    minval = float(prob_escape_filters[f"min_neut_standard_{prop}"])
    if all(
        (fit_neut_standard_fracs[f"{stype}_{prop}"] >= minval).all()
        for stype in ["antibody", "no-antibody"]
    ):
        print(f"Adequate neut_standard_{prop} of >= {minval}")
    else:
        raise ValueError(
            f"Inadequate neut_standard_{prop} < {minval}\n{neut_standard_fracs}"
        )
Adequate neut_standard_count of >= 1000.0
Adequate neut_standard_frac of >= 1e-05
Get variants with adequate counts to retain
First get the minimum counts a variant needs in order to be retained: it must meet the count threshold for either the antibody or the no-antibody sample:
# Per-sample count thresholds; a variant only has to reach one of the two.
total_counts = prob_escape.groupby("sample", as_index=False).aggregate(
    {"antibody_count": "sum", "no-antibody_count": "sum"}
)

# Each threshold is the configured fraction of the sample's total counts,
# but never lower than the configured absolute minimum count.
antibody_threshold = (
    prob_escape_filters["min_antibody_frac"] * total_counts["antibody_count"]
).clip(lower=prob_escape_filters["min_antibody_count"])
no_antibody_threshold = (
    prob_escape_filters["min_no_antibody_frac"] * total_counts["no-antibody_count"]
).clip(lower=prob_escape_filters["min_no_antibody_count"])

min_counts = total_counts.assign(
    min_antibody_count=antibody_threshold,
    min_no_antibody_count=no_antibody_threshold,
)[["sample", "min_antibody_count", "min_no_antibody_count"]]

display(min_counts)
| | sample | min_antibody_count | min_no_antibody_count |
|---|---|---|---|
| 0 | B-231011-rescue_6_ultra-3BNC117-10.0-1 | 536.89854 | 20.0 |
| 1 | B-231011-rescue_6_ultra-3BNC117-2.5-1 | 529.91768 | 20.0 |
| 2 | B-231011-rescue_6_ultra-3BNC117-20.0-1 | 420.45620 | 20.0 |
| 3 | B-231011-rescue_6_ultra-3BNC117-5.0-1 | 704.37410 | 20.0 |
Now plot the distribution of no-antibody and antibody counts versus the thresholds. Recall we keep variants that meet either threshold, and in an ideal experiment all variants would meet the no-antibody threshold but we may expect only a small fraction (true escape mutations) to meet the antibody threshold.
In the plots below, the bars span the interquartile range, the lines go from min to max, the dark black line is the median, and the red line is the threshold for counts to be retained (a variant only needs to meet one threshold).
# Retention thresholds in long form, relabelled so that `count_type` matches the
# labels produced when the counts themselves are melted.
count_thresholds = min_counts.rename(
    columns={
        "min_antibody_count": "antibody_count",
        "min_no_antibody_count": "no-antibody_count",
    }
).melt(id_vars="sample", var_name="count_type", value_name="threshold")

# One row per (variant, count type) so both count types can be summarized together.
counts_long = prob_escape.melt(
    id_vars=["sample", "concentration", "use_in_fit"],
    value_vars=["antibody_count", "no-antibody_count"],
    var_name="count_type",
    value_name="count",
)

# Five-number summary of the counts per sample and count type, plus the threshold.
count_summary = (
    counts_long.groupby(
        ["sample", "concentration", "use_in_fit", "count_type"], as_index=False
    )
    .aggregate(
        median=("count", "median"),
        q1=("count", lambda counts: counts.quantile(0.25)),
        q3=("count", lambda counts: counts.quantile(0.75)),
        min=("count", "min"),
        max=("count", "max"),
    )
    .merge(count_thresholds, on=["sample", "count_type"], validate="one_to_one")
)
# Shared base for all layers: same data, one row per sample on the y-axis (ordered by
# concentration), every summary column in the tooltip, and color keyed on use_in_fit.
base_chart = alt.Chart(count_summary).encode(
    y=alt.Y("sample", title=None, sort=alt.SortField("concentration")),
    tooltip=count_summary.columns.tolist(),
    color=alt.Color(
        "use_in_fit",
        scale=alt.Scale(domain=[True, False], range=["blue", "gray"]),
    ),
)
# Bar spanning the interquartile range (q1 to q3); this layer also defines the
# symlog x-scale and the "count" axis title.
# NOTE(review): the mark-level color="blue" here and on `range_line` coexists with the
# color encoding inherited from `base_chart`; presumably the encoding wins — confirm.
quantile_bar = base_chart.encode(
    x=alt.X(
        "q1",
        scale=alt.Scale(type="symlog", constant=20),
        axis=alt.Axis(labelOverlap=True),
        title="count",
    ),
    x2="q3",
).mark_bar(color="blue", height={"band": 0.8})
# Thin rule from the minimum to the maximum count.
range_line = base_chart.encode(x="min", x2="max").mark_rule(color="blue", opacity=0.5)
# Median drawn as a zero-width bar widened by the x offsets, in constant black.
median_line = base_chart.encode(
    x="median", x2="median", color=alt.value("black")
).mark_bar(xOffset=1, x2Offset=-1, height={"band": 0.8})
# Retention threshold drawn the same way as the median, in constant red.
threshold_line = base_chart.encode(
    x="threshold", x2="threshold", color=alt.value("red")
).mark_bar(xOffset=1, x2Offset=-1, height={"band": 0.8})
# Layer the four marks (later layers are added on top of earlier ones) and draw one
# column of panels per count type.
count_summary_chart = (quantile_bar + range_line + median_line + threshold_line).facet(
    column=alt.Column(
        "count_type",
        title=None,
        sort="descending",
        header=alt.Header(labelFontWeight="bold", labelFontSize=12),
    ),
)
count_summary_chart
Classify which variants to retain:
# Attach the per-sample thresholds. Any threshold columns left over from a previous
# execution of this cell are dropped first so the merge does not create duplicates.
prob_escape = prob_escape.drop(
    columns=["min_no_antibody_count", "min_antibody_count"],
    errors="ignore",
).merge(min_counts, on="sample", validate="many_to_one")

# A variant is retained if it reaches the threshold in either selection.
meets_antibody_threshold = (
    prob_escape["antibody_count"] >= prob_escape["min_antibody_count"]
)
meets_no_antibody_threshold = (
    prob_escape["no-antibody_count"] >= prob_escape["min_no_antibody_count"]
)
prob_escape["retain"] = meets_antibody_threshold | meets_no_antibody_threshold
Plot the fraction of all barcode counts and the fraction of all variants that are retained. We typically retain a higher fraction of barcode counts than variants, since the barcode counts are asymmetrically distributed toward some variants, which are more likely to be retained.
# One row per (variant, count type); `retained_count` equals the count for retained
# variants and zero for the others.
retention_long = prob_escape.melt(
    id_vars=["sample", "concentration", "use_in_fit", "retain", "barcode"],
    value_vars=["antibody_count", "no-antibody_count"],
    var_name="count_type",
    value_name="count",
)
retention_long["retained_count"] = (
    retention_long["count"] * retention_long["retain"].astype(int)
)

# Totals per sample and count type, for barcode counts and for number of variants.
retention_totals = retention_long.groupby(
    ["sample", "concentration", "use_in_fit", "count_type"], as_index=False
).aggregate(
    counts=("count", "sum"),
    retained_counts=("retained_count", "sum"),
    variants=("barcode", "count"),
    retained_variants=("retain", "sum"),
)

# Turn the totals into fractions; `variants` is overwritten by its retained fraction.
retention_totals["barcode_counts"] = (
    retention_totals["retained_counts"] / retention_totals["counts"]
)
retention_totals["variants"] = (
    retention_totals["retained_variants"] / retention_totals["variants"]
)

frac_retained = retention_totals.melt(
    id_vars=["sample", "concentration", "use_in_fit", "count_type"],
    value_vars=["variants", "barcode_counts"],
    var_name="frac_type",
    value_name="fraction_retained",
)

# every column goes in the tooltip; only the fraction gets a fixed number format
frac_retained_tooltips = [
    alt.Tooltip(col, format=".3f") if col == "fraction_retained" else col
    for col in frac_retained.columns
]

frac_retained_chart = (
    alt.Chart(frac_retained)
    .mark_bar()
    .encode(
        y=alt.Y("sample", title=None, sort=alt.SortField("concentration")),
        x=alt.X("fraction_retained", scale=alt.Scale(domain=[0, 1])),
        yOffset="count_type",
        color="count_type",
        # bars for samples not used in the fit are drawn fainter
        opacity=alt.Opacity(
            "use_in_fit",
            scale=alt.Scale(domain=[True, False], range=[1, 0.4]),
        ),
        column=alt.Column(
            "frac_type",
            title=None,
            header=alt.Header(labelFontWeight="bold", labelFontSize=12),
        ),
        tooltip=frac_retained_tooltips,
    )
    .properties(height=alt.Step(12), width=250)
)
frac_retained_chart
Probability (fraction) escape among retained variants
We now just analyze retained variants:
# Number of distinct retained barcodes (variants) for each sample / concentration.
n_retained_by_sample = (
    prob_escape.query("retain")
    .groupby(["sample", "concentration"])
    .aggregate(n_variants=("barcode", "nunique"))
)
display(n_retained_by_sample)
| sample | concentration | n_variants |
|---|---|---|
| B-231011-rescue_6_ultra-3BNC117-10.0-1 | 10.0 | 62188 |
| B-231011-rescue_6_ultra-3BNC117-2.5-1 | 2.5 | 61888 |
| B-231011-rescue_6_ultra-3BNC117-20.0-1 | 20.0 | 65377 |
| B-231011-rescue_6_ultra-3BNC117-5.0-1 | 5.0 | 61889 |
Get the mean probability of escape across all retained variants with the indicated number of mutations. Note we weight each retained variant equally regardless of how many barcode counts it has. We plot means for both the censored (clipped to lie between 0 and 1) and uncensored probability of escape. Note that the plot uses a symlog scale for the y-axis. Mouseover points for details.
max_aa_subs = prob_escape_filters["max_aa_subs"]

# Work only with retained variants. Each is labelled by its number of amino-acid
# substitutions, with every count at or above `max_aa_subs` pooled into one ">"
# label, and its uncensored escape probability is capped at the configured value.
retained_escape = prob_escape.query("retain")
retained_escape = retained_escape.assign(
    n_substitutions=(
        retained_escape["aa_substitutions"]
        .str.split()
        .map(len)
        .clip(upper=max_aa_subs)
        .map(lambda n: str(n) if n < max_aa_subs else f">{int(max_aa_subs - 1)}")
    ),
    prob_escape_uncensored=retained_escape["prob_escape_uncensored"].clip(
        upper=prob_escape_filters["clip_uncensored_prob_escape"],
    ),
)

# Unweighted mean over variants for each sample and substitution class.
mean_by_class = retained_escape.groupby(
    ["sample", "concentration", "use_in_fit", "n_substitutions"], as_index=False
).aggregate(
    prob_escape=("prob_escape", "mean"),
    prob_escape_uncensored=("prob_escape_uncensored", "mean"),
    n_variants=("barcode", "count"),
)

# Long form: one row per (sample, substitution class, censored / not censored).
mean_prob_escape = mean_by_class.rename(
    columns={
        "prob_escape": "censored to [0, 1]",
        "prob_escape_uncensored": "not censored",
    }
).melt(
    id_vars=[
        "sample",
        "concentration",
        "use_in_fit",
        "n_substitutions",
        "n_variants",
    ],
    var_name="censored",
    value_name="probability escape",
)

# NOTE(review): the message below says "samples used in fit", but no `use_in_fit`
# filter is applied above, so the CSV holds every sample (with a use_in_fit column).
print(f"Writing mean prob escape for samples used in fit to {prob_escape_mean_csv}")
mean_prob_escape.to_csv(prob_escape_mean_csv, index=False, float_format="%.4g")
# Optional per-assay overrides for the concentration axis: a custom title is used
# only when configured, and the scale falls back to a log scale.
concentration_axis_kwargs = {}
if "concentration_title" in assay_config:
    concentration_axis_kwargs["title"] = assay_config["concentration_title"]
concentration_scale_kwargs = assay_config.get("concentration_scale", {"type": "log"})

# every column goes in the tooltip; only the probability gets a fixed number format
mean_prob_escape_tooltips = [
    alt.Tooltip(col, format=".3g") if col == "probability escape" else col
    for col in mean_prob_escape.columns
]

mean_prob_escape_chart = (
    alt.Chart(mean_prob_escape)
    .mark_line(point=True, size=0.75, opacity=0.8)
    .encode(
        x=alt.X(
            "concentration",
            scale=alt.Scale(**concentration_scale_kwargs),
            **concentration_axis_kwargs,
        ),
        y=alt.Y(
            "probability escape",
            scale=alt.Scale(**assay_config["prob_escape_scale"]),
        ),
        column=alt.Column(
            "censored",
            title=None,
            header=alt.Header(labelFontWeight="bold", labelFontSize=12),
        ),
        color=alt.Color("n_substitutions"),
        tooltip=mean_prob_escape_tooltips,
        shape=alt.Shape("use_in_fit", scale=alt.Scale(domain=[True, False])),
    )
    .properties(width=220, height=140)
    .configure_axis(grid=False)
    .configure_point(size=50)
)
mean_prob_escape_chart
Writing mean prob escape for samples used in fit to results/antibody_escape/by_selection/B-231011-rescue_6_ultra-3BNC117-1_prob_escape_mean.csv
Fit polyclonal model
Fit the model. If there is more than one epitope, we fit models with fewer epitopes too:
# first build up arguments used to specify fitting
n_epitopes = params["polyclonal_params"]["n_epitopes"]
spatial_distances = params["polyclonal_params"]["spatial_distances"]
fit_kwargs = params["polyclonal_params"]["fit_kwargs"]
# NOTE: this is the same dict object as params["escape_plot_kwargs"], so the keys
# added below are also visible through `params`.
escape_plot_kwargs = params["escape_plot_kwargs"]
plot_hide_stats = params["plot_hide_stats"]

site_numbering_map = pd.read_csv(site_numbering_map_csv).sort_values("sequential_site")
# Explicit check rather than `assert`, which is skipped when Python runs with -O.
if site_numbering_map[["sequential_site", "reference_site"]].isnull().any().any():
    raise ValueError(
        f"{site_numbering_map_csv} has null values in the 'sequential_site' "
        "and/or 'reference_site' columns"
    )

# Make sure the containers extended below exist.
escape_plot_kwargs.setdefault("addtl_slider_stats", {})
escape_plot_kwargs.setdefault("addtl_slider_stats_hide_not_filter", [])
escape_plot_kwargs["df_to_merge"] = []

# Each configured statistic becomes a slider in the plot: set its initial slider
# value, list it under `addtl_slider_stats_hide_not_filter`, and queue its
# per-mutation values (read from CSV) for merging into the plotted data.
for stat, stat_d in plot_hide_stats.items():
    escape_plot_kwargs["addtl_slider_stats"][stat] = stat_d["init"]
    # Skip names already listed (same guard as for the tooltip stats below), so that
    # re-running this cell or listing the stat in the config gives no duplicates.
    if stat not in escape_plot_kwargs["addtl_slider_stats_hide_not_filter"]:
        escape_plot_kwargs["addtl_slider_stats_hide_not_filter"].append(stat)
    merge_df = pd.read_csv(stat_d["csv"]).rename(columns={stat_d["csv_col"]: stat})
    if "min_filters" in stat_d:
        # drop rows that fall below the configured minimum in any filter column
        for col, col_min in stat_d["min_filters"].items():
            if col not in merge_df.columns:
                raise ValueError(f"{stat=} CSV lacks {col=}\n{merge_df.columns=}")
            merge_df = merge_df[merge_df[col] >= col_min]
    escape_plot_kwargs["df_to_merge"].append(merge_df[["site", "mutant", stat]])

# Also merge in the alternative site-numbering columns and the "region" column,
# keyed on the reference site.
addtl_site_cols = [
    c
    for c in site_numbering_map.columns
    if c.endswith("site") and c != "reference_site"
]
escape_plot_kwargs["df_to_merge"].append(
    site_numbering_map.rename(columns={"reference_site": "site"})[
        ["site", *addtl_site_cols, "region"]
    ]
)

# Show each additional site-numbering column in the tooltips, without duplicates.
escape_plot_kwargs.setdefault("addtl_tooltip_stats", [])
for c in addtl_site_cols:
    if c not in escape_plot_kwargs["addtl_tooltip_stats"]:
        escape_plot_kwargs["addtl_tooltip_stats"].append(c)

escape_plot_kwargs["scale_stat_col"] = assay_config["scale_stat"]
if assay_config["stat_name"] != "escape":
    escape_plot_kwargs["rename_stat_col"] = assay_config["stat_name"]

# `spatial_distances` starts as a CSV path (or None) and is replaced by the table.
if spatial_distances is not None:
    print(f"Reading spatial distances from {spatial_distances}")
    spatial_distances = pd.read_csv(spatial_distances)
    print(f"Read spatial distances for {len(spatial_distances)} residue pairs")
# now fit the models
# With fewer than one epitope the loop below would never run, and the save step would
# then fail with a NameError on `n` / `model`; report the real problem instead.
if n_epitopes < 1:
    raise ValueError(f"{n_epitopes=} must be at least 1 to fit and save a model")

# Fit models with 1, 2, ..., n_epitopes epitopes, showing the plots for each.
for n in range(1, n_epitopes + 1):
    print(f"\n\nFitting a model for {n} epitopes")
    model = polyclonal.Polyclonal(
        n_epitopes=n,
        # only retained variants from samples flagged for use in the fit
        data_to_fit=(
            prob_escape.query("retain").query("use_in_fit")[
                ["aa_substitutions", "concentration", "prob_escape"]
            ]
        ),
        alphabet=polyclonal.AAS_WITHSTOP_WITHGAP,
        spatial_distances=spatial_distances,
        sites=site_numbering_map["reference_site"],
    )
    opt_res = model.fit(**fit_kwargs)
    print("Here is the neutralization curve:")
    display(model.curves_plot())
    print("Here is the mutation-effect plot:")
    display(model.mut_escape_plot(**escape_plot_kwargs))

# Only the model from the final iteration (n == n_epitopes) is saved.
print(f"\n\nWriting the {n} epitope model to {pickle_file}")
with open(pickle_file, "wb") as f:
    pickle.dump(model, f)
Reading spatial distances from results/spatial_distances/5FYK.csv Read spatial distances for 188805 residue pairs Fitting a model for 1 epitopes
#
# Fitting site-level fixed Hill coefficient and non-neutralized frac model.
# Starting optimization of 865 parameters at Sat Jan 18 13:54:57 2025.
        step    time_sec        loss    fit_loss  reg_escape  reg_spread reg_spatial reg_uniqueness reg_uniqueness2 reg_activity reg_hill_coefficient reg_non_neutralized_frac
           0    0.045682       14850       14818           0           0           0              0               0       32.409                    0                        0
99 5.3546 8351.1 8252.1 21.252 0 61.072 0 0 16.739 0 0
# Successfully finished at Sat Jan 18 13:55:03 2025. # # Fitting fixed Hill coefficient and non-neutralized frac model. # Starting optimization of 12282 parameters at Sat Jan 18 13:55:03 2025.
        step    time_sec        loss    fit_loss  reg_escape  reg_spread reg_spatial reg_uniqueness reg_uniqueness2 reg_activity reg_hill_coefficient reg_non_neutralized_frac
           0     0.17382       17205       16848      279.34   7.221e-32      61.072              0               0         16.5                    0                        0
200 39.902 15930 15629 210.78 4.5738 65.44 0 0 20.653 0 0
341 68.174 15927 15622 212.81 4.8719 66.783 0 0 20.642 0 0
# Successfully finished at Sat Jan 18 13:56:11 2025.
#
# Fitting model.
# Starting optimization of 12284 parameters at Sat Jan 18 13:56:11 2025.
        step    time_sec        loss    fit_loss  reg_escape  reg_spread reg_spatial reg_uniqueness reg_uniqueness2 reg_activity reg_hill_coefficient reg_non_neutralized_frac
           0     0.16569       15908       15622      212.81      4.8719      66.783              0               0       2.0642                    0                        0
200 40.615 13372 12826 268.03 4.5575 118.32 0 0 0.86055 154.2 0.13052
400 79.37 13347 12788 270.66 4.5098 120.01 0 0 0.85663 162.82 0.15015
439 87.453 13344 12775 272.11 4.5292 121.68 0 0 0.85272 170.12 0.16227
# Successfully finished at Sat Jan 18 13:57:39 2025. Here is the neutralization curve:
Here is the mutation-effect plot:
Writing the 1 epitope model to results/antibody_escape/by_selection/B-231011-rescue_6_ultra-3BNC117-1_polyclonal_model.pickle