ACE2 binding effects of non-RBD mutations in natural sequences¶
Look at ACE2 binding effects of mutations in non-RBD sequences.
In [1]:
# this cell is tagged as `parameters` for `papermill` parameterization
dms_summary_csv = None
pango_consensus_seqs_json = None
In [2]:
# Parameters
pango_consensus_seqs_json = (
"results/compare_natural/pango-consensus-sequences_summary.json"
)
xbb15_dms_csv = "results/summaries/summary.csv"
ba2_dms_csv = "https://raw.githubusercontent.com/dms-vep/SARS-CoV-2_Omicron_BA.2_spike_ACE2_binding/main/results/summaries/summary.csv"
In [3]:
import collections
import json
import altair as alt
import numpy
import pandas as pd
Get spike mutations relative to reference and new spike mutations relative to parent in Pango clades descended from starting clades:
In [4]:
starting_clades = ["XBB", "BA.2", "BA.5", "BA.2.86"]
with open(pango_consensus_seqs_json) as f:
pango_clades = json.load(f)
def build_records(c, recs):
"""Build records of Pango clade information."""
if c in recs["clade"]:
return
recs["clade"].append(c)
recs["date"].append(pango_clades[c]["designationDate"])
recs["new_spike_muts"].append(
[
mut.split(":")[1]
for field in ["aaSubstitutionsNew", "aaDeletionsNew"]
for mut in pango_clades[c][field]
if mut.startswith("S:")
]
)
recs["spike_muts"].append(
[
mut.split(":")[1]
for field in ["aaSubstitutions", "aaDeletions"]
for mut in pango_clades[c][field]
if mut.startswith("S:")
]
)
for c_child in pango_clades[c]["children"]:
build_records(c_child, recs)
pango_dfs = []
for starting_clade in starting_clades:
records = collections.defaultdict(list)
build_records(starting_clade, records)
pango_dfs.append(
pd.DataFrame(records)
.query("clade != @starting_clade")
.assign(parent_clade=starting_clade)
)
pango_df = pd.concat(pango_dfs)
Get the counts of how many times each mutation newly occurs in a clade:
In [5]:
new_mut_counts = (
pango_df
.explode("new_spike_muts")
.query("new_spike_muts.notnull()")
.rename(columns={"new_spike_muts": "mutation"})
.groupby(["parent_clade", "mutation"], as_index=False)
.aggregate(
n_clades=pd.NamedAgg("clade", "count"),
clades=pd.NamedAgg("clade", "unique"),
)
.assign(
site=lambda x: x["mutation"].str[1: -1].astype(int),
clades=lambda x: x["clades"].map(lambda s: "; ".join(s)),
)
)
Add DMS phenotypes:
In [6]:
xbb15_dms = pd.read_csv(xbb15_dms_csv).rename(
columns={
"spike mediated entry": "cell entry",
"human sera escape": "sera escape",
}
)
ba2_dms = pd.read_csv(ba2_dms_csv).rename(
columns={
"spike mediated entry": "cell entry",
"human sera escape": "sera escape",
"ACE2 affinity": "ACE2 binding",
}
)
# specify DMS phenotypes of interest
phenotypes = [
# "sera escape",
"ACE2 binding",
"cell entry",
]
assert set(phenotypes).issubset(xbb15_dms.columns)
assert set(phenotypes).issubset(ba2_dms.columns)
def mut_dms(m, dms_data, site_to_region, dms_wt):
"""Get DMS phenotypes for a mutation."""
null_d = {k: pd.NA for k in phenotypes}
if pd.isnull(m) or int(m[1: -1]) not in dms_wt:
d = null_d
d["is_RBD"] = pd.NA
else:
parent = m[0]
site = int(m[1: -1])
mut = m[-1]
wt = dms_wt[site]
if parent == wt:
try:
d = dms_data[(site, parent, mut)]
except KeyError:
d = null_d
elif mut == wt:
try:
d = {k: -v for (k, v) in dms_data[(site, mut, parent)].items()}
except KeyError:
d = null_d
else:
try:
parent_d = dms_data[(site, wt, parent)]
mut_d = dms_data[(site, wt, mut)]
d = {p: mut_d[p] - parent_d[p] for p in phenotypes}
except KeyError:
d = null_d
d["is_RBD"] = (site_to_region[site] == "RBD")
assert list(d) == phenotypes + ["is_RBD"]
return d
site_to_region = {}
for strain, dms_df in [("XBB.1.5", xbb15_dms), ("BA.2", ba2_dms)]:
# dict that maps site to wildtype in DMS
dms_wt = dms_df.set_index("site")["wildtype"].to_dict()
# dict that maps site to region in DMS
site_to_region = dms_df.set_index("site")["region"].to_dict()
dms_data = (
dms_df
.set_index(["site", "wildtype", "mutant"])
[phenotypes]
.to_dict(orient="index")
)
site_to_region.update(dms_df.set_index("site")["region"].to_dict())
for phenotype in phenotypes:
new_mut_counts[f"{strain} {phenotype}"] = new_mut_counts["mutation"].map(
lambda m: mut_dms(m, dms_data, site_to_region, dms_wt)[phenotype]
)
new_mut_counts["region"] = new_mut_counts["site"].map(site_to_region)
Look at non-RBD mutations observed at least 3 times in XBB-descended clades that substantially increase ACE2 binding:
In [7]:
pd.set_option("display.max_colwidth", 1000)
display(
new_mut_counts
.sort_values("XBB.1.5 ACE2 binding", ascending=False)
.query("parent_clade == 'XBB'")
.query("region != 'RBD'")
.query("n_clades >= 3")
.query("`XBB.1.5 ACE2 binding` >= 0.1")
.reset_index(drop=True)
)
| parent_clade | mutation | n_clades | clades | site | XBB.1.5 ACE2 binding | XBB.1.5 cell entry | BA.2 ACE2 binding | BA.2 cell entry | region | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | XBB | T572I | 9 | XBB.1.5.44; GK.3.2; FL.36; GA.4.1.2; FY.1.2; FY.5.1; FY.5.4; FY.5.5.1; FY.8 | 572 | 0.4321 | 0.1548 | 0.3777 | -0.04124 | other |
| 1 | XBB | T732I | 6 | HK.3.9; JG.1; XBB.1.16.7; XBB.1.16.8; XBB.1.38; GE.1.3 | 732 | 0.3729 | 0.06165 | 0.05763 | 0.245 | S2 |
| 2 | XBB | F157L | 5 | EU.1.1.3; HC.2; HK.3.1; HK.19; EG.5.1.6 | 157 | 0.2894 | -0.107 | 0.008524 | 0.04201 | NTD |
| 3 | XBB | S704L | 9 | XBB.1.5.58; GK.1; JG.3; HV.1.6.1; EG.5.2.1; EG.5.2.3; GA.4.1; KE.2; FY.4.1.1 | 704 | 0.2691 | 0.07445 | 0.3364 | -0.1748 | S2 |
| 4 | XBB | S640F | 3 | XBB.1.2; EG.1.8; EG.5 | 640 | 0.1953 | -0.09004 | 0.1969 | -0.6828 | other |
| 5 | XBB | T307I | 3 | FL.2.2.1; HK.1.2; JG.3.1 | 307 | 0.1813 | -0.2222 | 0.01207 | -0.2601 | other |
| 6 | XBB | E583D | 4 | FZ.1; JD.1.1.2; FY.1.3; HL.1 | 583 | 0.1708 | -0.8333 | 0.1246 | -1.157 | other |
| 7 | XBB | Q52H | 6 | FT.3; FL.20; EG.5.1; XBB.1.16.12; JM.1; XBB.2.10 | 52 | 0.1697 | 0.07384 | 0.05101 | 0.1993 | NTD |
| 8 | XBB | Q675K | 3 | EK.4; XBB.1.16.28; FY.9 | 675 | 0.1302 | -0.05602 | -0.2876 | 0.08245 | other |
| 9 | XBB | G181R | 3 | FL.10.3; HK.3.6; GM.2 | 181 | 0.1072 | 0.00935 | -0.07778 | -0.03091 | NTD |
Same for BA.2 and BA.5 descended clades:
In [8]:
pd.set_option("display.max_colwidth", 1000)
display(
new_mut_counts
.sort_values("BA.2 ACE2 binding", ascending=False)
.query("parent_clade in ['BA.5', 'BA.2']")
.query("region != 'RBD'")
.query("n_clades >= 3")
.query("`BA.2 ACE2 binding` >= 0.1")
.reset_index(drop=True)
)
| parent_clade | mutation | n_clades | clades | site | XBB.1.5 ACE2 binding | XBB.1.5 cell entry | BA.2 ACE2 binding | BA.2 cell entry | region | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | BA.2 | T572I | 8 | BM.1.1.4; DS.2; JN.1.1.1; JN.1.4.3; JN.1.7; JN.1.8.1; JN.1.9.1; JN.2.2 | 572 | 0.4321 | 0.1548 | 0.3777 | -0.04124 | other |
| 1 | BA.5 | S704L | 3 | BF.11.4; DN.1.1.3; BQ.1.1.42 | 704 | 0.2691 | 0.07445 | 0.3364 | -0.1748 | S2 |
| 2 | BA.2 | H69- | 7 | BP.1; DD.1; BA.2.3.22; BA.2.38.3; BA.2.77; BA.2.86; BA.2.87.1 | 69 | 0.05422 | 0.08117 | 0.265 | 0.04309 | NTD |
| 3 | BA.2 | D936H | 3 | CJ.1.3.2; BN.1.11; BA.2.77 | 936 | 0.04162 | -0.0695 | 0.2327 | -0.05494 | S2 |
| 4 | BA.5 | D253G | 3 | BQ.1.1.2; BQ.1.1.44; FB.1 | 253 | -0.1234 | -0.1294 | 0.181 | -0.02794 | NTD |
| 5 | BA.5 | K182E | 6 | BF.7.3; DN.3.1; BQ.1.1.39; BQ.1.1.71; BQ.1.1.74; JH.1 | 182 | -0.1553 | 0.04154 | 0.1602 | -0.0495 | NTD |
| 6 | BA.5 | K147N | 7 | DJ.1.1.1; BF.5.5; BF.11.2; BU.3; DQ.1; DN.1; BQ.1.1.66 | 147 | -0.008214 | 0.03312 | 0.1291 | 0.02618 | NTD |
Also look at how well libraries cover mutations of interest by seeing what fraction of all new mutations in descendant clades are covered well enough to make viral entry estimates for both the BA.2 and XBB libraries and those parent clades:
In [9]:
has_measurement_df = (
new_mut_counts
.query("region.notnull()") # excludes cytoplasmic tail
.assign(
has_entry_measurement=lambda x: numpy.where(
x["parent_clade"] == "BA.2",
x["BA.2 cell entry"].notnull(),
x["XBB.1.5 cell entry"].notnull(),
),
n_clades=lambda x: x["n_clades"].clip(upper=6),
)
[["parent_clade", "mutation", "n_clades", "has_entry_measurement"]]
)
print("Mutations that appear in any Pango clades that have measurements")
display(
has_measurement_df
.assign(in_multiple_clades=lambda x: x["n_clades"] > 1)
.groupby(["parent_clade", "in_multiple_clades"])
.aggregate(
n_mutations=pd.NamedAgg("mutation", "count"),
has_entry_measurement=pd.NamedAgg("has_entry_measurement", "sum"),
)
)
has_measurement_chart = (
alt.Chart(has_measurement_df)
.encode(
x=alt.X(
"n_clades",
bin=alt.BinParams(step=1),
),
y=alt.Y("count()"),
color="has_entry_measurement",
column="parent_clade",
tooltip=["n_clades", "count()"],
)
.mark_bar()
)
has_measurement_chart
Mutations that appear in any Pango clades that have measurements
| n_mutations | has_entry_measurement | ||
|---|---|---|---|
| parent_clade | in_multiple_clades | ||
| BA.2 | False | 171 | 147 |
| True | 86 | 85 | |
| BA.2.86 | False | 19 | 18 |
| True | 12 | 12 | |
| BA.5 | False | 111 | 109 |
| True | 57 | 56 | |
| XBB | False | 217 | 211 |
| True | 108 | 107 |
Out[9]:
In [ ]: