ACE2 binding effects of non-RBD mutations in natural sequences¶

Look at ACE2 binding effects of mutations in non-RBD sequences.

In [1]:
# this cell is tagged as `parameters` for `papermill` parameterization
dms_summary_csv = None
pango_consensus_seqs_json = None
In [2]:
# Parameters
pango_consensus_seqs_json = (
    "results/compare_natural/pango-consensus-sequences_summary.json"
)
xbb15_dms_csv = "results/summaries/summary.csv"
ba2_dms_csv = "https://raw.githubusercontent.com/dms-vep/SARS-CoV-2_Omicron_BA.2_spike_ACE2_binding/main/results/summaries/summary.csv"
In [3]:
import collections
import json

import altair as alt

import numpy

import pandas as pd

Get spike mutations relative to reference and new spike mutations relative to parent in Pango clades descended from starting clades:

In [4]:
starting_clades = ["XBB", "BA.2", "BA.5", "BA.2.86"]

with open(pango_consensus_seqs_json) as f:
    pango_clades = json.load(f)

def build_records(c, recs):
    """Build records of Pango clade information."""
    if c in recs["clade"]:
        return
    recs["clade"].append(c)
    recs["date"].append(pango_clades[c]["designationDate"])
    recs["new_spike_muts"].append(
        [
            mut.split(":")[1]
            for field in ["aaSubstitutionsNew", "aaDeletionsNew"]
            for mut in pango_clades[c][field]
            if mut.startswith("S:")
        ]
    )
    recs["spike_muts"].append(
        [
            mut.split(":")[1]
            for field in ["aaSubstitutions", "aaDeletions"]
            for mut in pango_clades[c][field]
            if mut.startswith("S:")
        ]
    )
    for c_child in pango_clades[c]["children"]:
        build_records(c_child, recs)
        
pango_dfs = []
for starting_clade in starting_clades:
    records = collections.defaultdict(list)
    build_records(starting_clade, records)
    pango_dfs.append(
        pd.DataFrame(records)
        .query("clade != @starting_clade")
        .assign(parent_clade=starting_clade)
    )

pango_df = pd.concat(pango_dfs)

Get the counts of how many times each mutation newly occurs in a clade:

In [5]:
new_mut_counts = (
    pango_df
    .explode("new_spike_muts")
    .query("new_spike_muts.notnull()")
    .rename(columns={"new_spike_muts": "mutation"})
    .groupby(["parent_clade", "mutation"], as_index=False)
    .aggregate(
        n_clades=pd.NamedAgg("clade", "count"),
        clades=pd.NamedAgg("clade", "unique"),
    )
    .assign(
        site=lambda x: x["mutation"].str[1: -1].astype(int),
        clades=lambda x: x["clades"].map(lambda s: "; ".join(s)),
    )
)

Add DMS phenotypes:

In [6]:
xbb15_dms = pd.read_csv(xbb15_dms_csv).rename(
    columns={
        "spike mediated entry": "cell entry",
        "human sera escape": "sera escape",
    }
)

ba2_dms = pd.read_csv(ba2_dms_csv).rename(
    columns={
        "spike mediated entry": "cell entry",
        "human sera escape": "sera escape",
        "ACE2 affinity": "ACE2 binding",
    }
)

# specify DMS phenotypes of interest
phenotypes = [
#    "sera escape",
    "ACE2 binding",
    "cell entry",
]
assert set(phenotypes).issubset(xbb15_dms.columns)
assert set(phenotypes).issubset(ba2_dms.columns)

def mut_dms(m, dms_data, site_to_region, dms_wt):
    """Get DMS phenotypes for a mutation."""
    null_d = {k: pd.NA for k in phenotypes}
    if pd.isnull(m) or int(m[1: -1]) not in dms_wt:
        d = null_d
        d["is_RBD"] = pd.NA
    else:
        parent = m[0]
        site = int(m[1: -1])
        mut = m[-1]
        wt = dms_wt[site]
        if parent == wt:
            try:
                d = dms_data[(site, parent, mut)]
            except KeyError:
                d = null_d
        elif mut == wt:
            try:
                d = {k: -v for (k, v) in dms_data[(site, mut, parent)].items()}
            except KeyError:
                d = null_d
        else:
            try:
                parent_d = dms_data[(site, wt, parent)]
                mut_d = dms_data[(site, wt, mut)]
                d = {p: mut_d[p] - parent_d[p] for p in phenotypes}
            except KeyError:
                d = null_d
        d["is_RBD"] = (site_to_region[site] == "RBD")
    assert list(d) == phenotypes + ["is_RBD"]
    return d

site_to_region = {}

for strain, dms_df in [("XBB.1.5", xbb15_dms), ("BA.2", ba2_dms)]:

    # dict that maps site to wildtype in DMS
    dms_wt = dms_df.set_index("site")["wildtype"].to_dict()
    
    # dict that maps site to region in DMS
    site_to_region = dms_df.set_index("site")["region"].to_dict()
    
    dms_data = (
        dms_df
        .set_index(["site", "wildtype", "mutant"])
        [phenotypes]
        .to_dict(orient="index")
    )

    site_to_region.update(dms_df.set_index("site")["region"].to_dict())

    for phenotype in phenotypes:
        new_mut_counts[f"{strain} {phenotype}"] = new_mut_counts["mutation"].map(
            lambda m: mut_dms(m, dms_data, site_to_region, dms_wt)[phenotype]
        )

new_mut_counts["region"] = new_mut_counts["site"].map(site_to_region)

Look at non-RBD mutations observed at least 3 times in XBB-descended clades that substantially increase ACE2 binding:

In [7]:
pd.set_option("display.max_colwidth", 1000)

display(
    new_mut_counts
    .sort_values("XBB.1.5 ACE2 binding", ascending=False)
    .query("parent_clade == 'XBB'")
    .query("region != 'RBD'")
    .query("n_clades >= 3")
    .query("`XBB.1.5 ACE2 binding` >= 0.1")
    .reset_index(drop=True)
)
parent_clade mutation n_clades clades site XBB.1.5 ACE2 binding XBB.1.5 cell entry BA.2 ACE2 binding BA.2 cell entry region
0 XBB T572I 9 XBB.1.5.44; GK.3.2; FL.36; GA.4.1.2; FY.1.2; FY.5.1; FY.5.4; FY.5.5.1; FY.8 572 0.4321 0.1548 0.3777 -0.04124 other
1 XBB T732I 6 HK.3.9; JG.1; XBB.1.16.7; XBB.1.16.8; XBB.1.38; GE.1.3 732 0.3729 0.06165 0.05763 0.245 S2
2 XBB F157L 5 EU.1.1.3; HC.2; HK.3.1; HK.19; EG.5.1.6 157 0.2894 -0.107 0.008524 0.04201 NTD
3 XBB S704L 9 XBB.1.5.58; GK.1; JG.3; HV.1.6.1; EG.5.2.1; EG.5.2.3; GA.4.1; KE.2; FY.4.1.1 704 0.2691 0.07445 0.3364 -0.1748 S2
4 XBB S640F 3 XBB.1.2; EG.1.8; EG.5 640 0.1953 -0.09004 0.1969 -0.6828 other
5 XBB T307I 3 FL.2.2.1; HK.1.2; JG.3.1 307 0.1813 -0.2222 0.01207 -0.2601 other
6 XBB E583D 4 FZ.1; JD.1.1.2; FY.1.3; HL.1 583 0.1708 -0.8333 0.1246 -1.157 other
7 XBB Q52H 6 FT.3; FL.20; EG.5.1; XBB.1.16.12; JM.1; XBB.2.10 52 0.1697 0.07384 0.05101 0.1993 NTD
8 XBB Q675K 3 EK.4; XBB.1.16.28; FY.9 675 0.1302 -0.05602 -0.2876 0.08245 other
9 XBB G181R 3 FL.10.3; HK.3.6; GM.2 181 0.1072 0.00935 -0.07778 -0.03091 NTD

Same for BA.2 and BA.5 descended clades:

In [8]:
pd.set_option("display.max_colwidth", 1000)

display(
    new_mut_counts
    .sort_values("BA.2 ACE2 binding", ascending=False)
    .query("parent_clade in ['BA.5', 'BA.2']")
    .query("region != 'RBD'")
    .query("n_clades >= 3")
    .query("`BA.2 ACE2 binding` >= 0.1")
    .reset_index(drop=True)
)
parent_clade mutation n_clades clades site XBB.1.5 ACE2 binding XBB.1.5 cell entry BA.2 ACE2 binding BA.2 cell entry region
0 BA.2 T572I 8 BM.1.1.4; DS.2; JN.1.1.1; JN.1.4.3; JN.1.7; JN.1.8.1; JN.1.9.1; JN.2.2 572 0.4321 0.1548 0.3777 -0.04124 other
1 BA.5 S704L 3 BF.11.4; DN.1.1.3; BQ.1.1.42 704 0.2691 0.07445 0.3364 -0.1748 S2
2 BA.2 H69- 7 BP.1; DD.1; BA.2.3.22; BA.2.38.3; BA.2.77; BA.2.86; BA.2.87.1 69 0.05422 0.08117 0.265 0.04309 NTD
3 BA.2 D936H 3 CJ.1.3.2; BN.1.11; BA.2.77 936 0.04162 -0.0695 0.2327 -0.05494 S2
4 BA.5 D253G 3 BQ.1.1.2; BQ.1.1.44; FB.1 253 -0.1234 -0.1294 0.181 -0.02794 NTD
5 BA.5 K182E 6 BF.7.3; DN.3.1; BQ.1.1.39; BQ.1.1.71; BQ.1.1.74; JH.1 182 -0.1553 0.04154 0.1602 -0.0495 NTD
6 BA.5 K147N 7 DJ.1.1.1; BF.5.5; BF.11.2; BU.3; DQ.1; DN.1; BQ.1.1.66 147 -0.008214 0.03312 0.1291 0.02618 NTD

Also look at how well libraries cover mutations of interest by seeing what fraction of all new mutations in descendant clades are covered well enough to make viral entry estimates for both the BA.2 and XBB libraries and those parent clades:

In [9]:
has_measurement_df = (
    new_mut_counts
    .query("region.notnull()")  # excludes cytoplasmic tail
    .assign(
        has_entry_measurement=lambda x: numpy.where(
            x["parent_clade"] == "BA.2",
            x["BA.2 cell entry"].notnull(),
            x["XBB.1.5 cell entry"].notnull(),
        ),
        n_clades=lambda x: x["n_clades"].clip(upper=6),
    )
    [["parent_clade", "mutation", "n_clades", "has_entry_measurement"]]
)

print("Mutations that appear in any Pango clades that have measurements")
display(
    has_measurement_df
    .assign(in_multiple_clades=lambda x: x["n_clades"] > 1)
    .groupby(["parent_clade", "in_multiple_clades"])
    .aggregate(
        n_mutations=pd.NamedAgg("mutation", "count"),
        has_entry_measurement=pd.NamedAgg("has_entry_measurement", "sum"),
    )
)

has_measurement_chart = (
    alt.Chart(has_measurement_df)
    .encode(
        x=alt.X(
            "n_clades",
            bin=alt.BinParams(step=1),
        ),
        y=alt.Y("count()"),
        color="has_entry_measurement",
        column="parent_clade",
        tooltip=["n_clades", "count()"],
    )
    .mark_bar()
)

has_measurement_chart
Mutations that appear in any Pango clades that have measurements
n_mutations has_entry_measurement
parent_clade in_multiple_clades
BA.2 False 171 147
True 86 85
BA.2.86 False 19 18
True 12 12
BA.5 False 111 109
True 57 56
XBB False 217 211
True 108 107
Out[9]:
In [ ]: